# ocrgpt / main.py
import base64
import io
import json
import os

from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from openai import OpenAI  # openai>=1.0 client interface
from PIL import Image
from PyPDF2 import PdfReader
import fitz  # PyMuPDF
# Load environment variables
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Missing OpenAI API key. Please set OPENAI_API_KEY in the environment variables.")
client = OpenAI(api_key=api_key)
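
# Example .env entry loaded above (illustrative placeholder, not a real key):
#   OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxx
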
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
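
# Note: the wildcard CORS policy above is convenient for development; restricting
# allow_origins to known frontends is the safer choice for production deployments.
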
def vision(file_content):
    """Render each PDF page to an image with PyMuPDF and extract its text with GPT-4o vision."""
    pdf_document = fitz.open(stream=file_content, filetype="pdf")
    vision_data = [{"type": "text", "text": "Extract all text from these images."}]
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        # Convert the rendered page to a PIL image
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        # Encode the image as base64 for the data-URL image payload
        buffered = io.BytesIO()
        img.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        vision_data.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_base64}"},
        })
    print("PDF pages converted to images successfully!")
    # Send the page images to GPT-4o for text extraction
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": vision_data}],
        )
        return response.choices[0].message.content
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error in GPT-4o vision processing: {str(e)}")


@app.post("/get_ocr_data/")
def get_data(input_file: UploadFile = File(...)):
    """Extract structured data from a PDF resume."""
    try:
        # Read the uploaded file
        file_content = input_file.file.read()
        file_type = input_file.content_type
        extracted_text = ""
        if file_type == "application/pdf":
            pdf_reader = PdfReader(io.BytesIO(file_content))
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    extracted_text += text + "\n"
            # Fall back to vision OCR when the PDF has no embedded text layer
            if not extracted_text.strip():
                print("\nVision OCR running...\n")
                extracted_text = vision(file_content)
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")
        print("Extracted Text:\n", extracted_text.strip())
        # Ask GPT-4o to structure the extracted text into JSON
        prompt = f"""This is CV data: {extracted_text.strip()}.
IMPORTANT: The output must be a single valid JSON object, exactly matching the structure below.
If no data is found, fill missing fields with "none". Do not include extra explanation text.
Example Output:
```json
{{
    "firstname": "First Name",
    "lastname": "Last Name",
    "email": "Email Address",
    "contact_number": "Contact Number",
    "home_address": "Full Home Address",
    "home_town": "Home Town or City",
    "total_years_of_experience": "Total Years of Experience",
    "education": "Institution Name, Degree Name",
    "LinkedIn_link": "LinkedIn URL",
    "experience": "Job Title, Start Date - End Date, Company Name; Job Title, Start Date - End Date, Company Name; Job Title, Start Date - End Date, Company Name",
    "industry": "industry of work",
    "skills": "Skill 1, Skill 2, Skill 3",
    "positions": ["Job Title 1", "Job Title 2"],
    "summary": "Summary of qualifications and experience"
}}
```"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an assistant that processes CV data into structured JSON."},
                {"role": "user", "content": prompt}
            ]
        )
        # Strip any markdown code fence before parsing, without touching the JSON body
        json_response = response.choices[0].message.content.strip()
        if json_response.startswith("```"):
            json_response = json_response.strip("`").removeprefix("json").strip()
        structured_data = json.loads(json_response)
        return {"data": structured_data}
    except HTTPException:
        # Re-raise explicit HTTP errors (e.g., 400 for unsupported file types) unchanged
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
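

if __name__ == "__main__":
    # Local development entry point: a minimal sketch, assuming uvicorn is installed.
    # Example request against a running server (hypothetical resume.pdf):
    #   curl -X POST -F "input_file=@resume.pdf" http://localhost:8000/get_ocr_data/
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)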