import base64

import litellm
from celery import shared_task
from django.conf import settings
from django.core.files.storage import default_storage
from django.db import transaction
from loguru import logger

from .models import ImageMemo, MemoStatus

# Prompt sent alongside the image; instructs the model to emit raw markdown
# (no code fence) with '?word?' markers for uncertain transcriptions.
TRANSCRIBE_PROMPT = """Transcribe the hand written notes in the attached image and present them as markdown. Do not use a fence, simply respond using markdown. If any words or letters are unclear, denote them with a '??'. For example if you were not sure whether a word is blow or blew you would transcribe it as '?blow?' If a text is underlined followed by a newline that indicates that it is a header. Use markdown H2 to denote it as such. Make sure to add 2 newlines newlines between sections. Anything that looks visually like a bullet point should be treated as such. This includes lines starting with hyphens. Replace bullet point indicators with * in the interpretted text. Please include whitespace and formatting for headings too. """


@shared_task
def process_memo(memo_id: str):
    """Run OCR on a memo and store the output.

    Loads the ImageMemo, marks it ``Processing``, sends its image to the
    OCR model via litellm, and stores the transcription in ``memo.content``
    with status ``Done``. On a missing image or a failed API call the memo
    is marked ``Error`` with an explanatory ``error_message`` instead of
    being left stuck in ``Processing``.

    Args:
        memo_id: Primary key of the ImageMemo to process.

    Raises:
        ImageMemo.DoesNotExist: If no memo with ``memo_id`` exists.
        Exception: Re-raised after recording an OCR API failure, so Celery
            still marks the task as failed.
    """
    logger.info(f"Looking up memo with id={memo_id}")
    memo = ImageMemo.objects.get(id=memo_id)

    with transaction.atomic():
        logger.info(f"Set status=processing for memo {memo.id}")
        memo.status = MemoStatus.Processing
        memo.save()

    # check that the image exists
    logger.info(f"Checking that image {memo.image.name} exists")
    if not default_storage.exists(memo.image.name):
        memo.status = MemoStatus.Error
        memo.error_message = f"Image file {memo.image.name} does not exist"
        memo.save()
        return

    # read the image into memory; use a context manager so the storage
    # handle is always closed (the original leaked it)
    logger.info(f"Reading image {memo.image.name}")
    with default_storage.open(memo.image.name) as image_file:
        image_bytes = image_file.read()

    # call the OCR API with the image inlined as a base64 data URL
    logger.info(f"Calling OCR API for memo {memo.id}")
    b64img = base64.b64encode(image_bytes).decode("utf-8")
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": TRANSCRIBE_PROMPT},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{memo.image_mimetype};base64,{b64img}"},
            },
        ],
    }
    litellm.api_base = settings.OPENAI_API_BASE
    litellm.api_key = settings.OPENAI_API_KEY
    try:
        response = litellm.completion(
            model=settings.OPENAI_MODEL,
            messages=[message],
            temperature=0.01,
        )
    except Exception as exc:
        # Record the failure so the memo does not stay in Processing
        # forever, then re-raise so Celery still registers the failure.
        logger.exception(f"OCR API call failed for memo {memo.id}")
        with transaction.atomic():
            memo.status = MemoStatus.Error
            memo.error_message = f"OCR request failed: {exc}"
            memo.save()
        raise

    with transaction.atomic():
        memo.content = response.choices[0].message["content"]
        memo.status = MemoStatus.Done
        memo.save()