|
|
import base64 |
|
|
import os |
|
|
import io |
|
|
import json |
|
|
from fastapi import FastAPI, HTTPException, File, UploadFile |
|
|
from fastapi.middleware.cors import CORSMiddleware |
|
|
from PyPDF2 import PdfReader |
|
|
from PIL import Image |
|
|
import fitz |
|
|
import openai |
|
|
import pytesseract |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
|
|
|
# Load settings from a local .env file (python-dotenv) before the key is read.
load_dotenv()

# Fail fast at import time: every endpoint in this module needs the OpenAI key.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Missing OpenAI API key. Please set OPENAI_API_KEY in the environment variables.")

app = FastAPI()

# CORS is fully open: any origin, method and header, with credentials enabled.
# NOTE(review): the FastAPI CORS documentation advises against combining
# wildcard values with allow_credentials=True -- confirm this is intended
# before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
|
|
|
def vision(file_content):
    """Transcribe a PDF by rendering its pages and sending them to GPT-4o.

    Each page is rasterised to a PNG with PyMuPDF, base64-encoded and attached
    as an ``image_url`` part of a single chat message that asks the model to
    extract all text.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        The ``content`` of the first choice in the model's reply.

    Raises:
        HTTPException: With status 500 when the OpenAI request fails.
    """
    vision_data = [{"type": "text", "text": "Extract all text from these images."}]

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(stream=file_content, filetype="pdf") as pdf_document:
        for page in pdf_document:
            # tobytes("png") already yields PNG-encoded bytes, so they can be
            # base64-encoded directly without a decode/re-encode round trip.
            png_bytes = page.get_pixmap().tobytes("png")
            img_base64 = base64.b64encode(png_bytes).decode("utf-8")
            vision_data.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
            })

    print("PDF pages converted to images successfully!")

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": vision_data}],
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error in GPT-4o vision processing: {str(e)}",
        ) from e
|
|
|
|
|
|
|
|
def _extract_pdf_text(file_content):
    """Return the embedded text of every PDF page, each followed by a newline."""
    reader = PdfReader(io.BytesIO(file_content))
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(f"{text}\n" for text in page_texts if text)


def _strip_code_fence(reply):
    """Return *reply* without a surrounding Markdown fence such as ```json.

    Only the fence markers and a leading ``json`` language tag are removed;
    occurrences of "json" inside the payload itself are left untouched.
    """
    text = reply.strip()
    start = text.find("```")
    if start == -1:
        return text
    end = text.rfind("```")
    # With a single fence marker there is no closing fence to cut at.
    inner = text[start + 3:end] if end > start else text[start + 3:]
    if inner[:4].lower() == "json":
        inner = inner[4:]
    return inner.strip()


def _build_cv_prompt(cv_text):
    """Build the user prompt asking the model to turn *cv_text* into JSON."""
    # NOTE(review): the prompt asks for a "JSON array" while the example is a
    # single JSON object -- confirm which shape downstream consumers expect.
    return f"""This is CV data: {cv_text}.
IMPORTANT: The output should be a JSON array! Make sure the JSON is valid.
If no data is found, fill missing fields with "none". Do not include extra explanation text.

Example Output:
```json
{{
"firstname": "First Name",
"lastname": "Last Name",
"email": "Email Address",
"contact_number": "Contact Number",
"home_address": "Full Home Address",
"home_town": "Home Town or City",
"total_years_of_experience": "Total Years of Experience",
"education": "Institution Name, Degree Name",
"LinkedIn_link": "LinkedIn URL",
"experience": "Job Title, Start Date - End Date, Company Name; Job Title, Start Date - End Date, Company Name; Job Title, Start Date - End Date, Company Name",
"industry": "industry of work",
"skills": "Skill 1, Skill 2, Skill 3",
"positions": ["Job Title 1", "Job Title 2"],
"summary": "Summary of qualifications and experience"
}}
```"""


@app.post("/get_ocr_data/")
def get_data(input_file: UploadFile = File(...)):
    """Extract structured data from a PDF resume.

    The PDF's embedded text is read with PyPDF2; when no text is found the
    pages are sent to ``vision`` instead. The resulting text is passed to
    GPT-4o, whose reply is parsed as JSON.

    Args:
        input_file: Uploaded file; only ``application/pdf`` is accepted.

    Returns:
        ``{"data": <parsed JSON>}``.

    Raises:
        HTTPException: 400 for a non-PDF upload; 500 for any failure while
            reading the file, calling the model or parsing its reply.
    """
    # Reject unsupported uploads up front so the 400 is not masked as a 500.
    if input_file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Unsupported file type")

    try:
        file_content = input_file.file.read()

        extracted_text = _extract_pdf_text(file_content)
        if not extracted_text.strip():
            # No text layer was found (e.g. a scanned document): fall back to
            # the image-based extraction.
            print("\nVision OCR running...\n")
            extracted_text = vision(file_content)

        print("Extracted Text:\n", extracted_text.strip())

        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an assistant that processes CV data into structured JSON."},
                {"role": "user", "content": _build_cv_prompt(extracted_text.strip())},
            ],
        )

        json_response = _strip_code_fence(response["choices"][0]["message"]["content"])
        structured_data = json.loads(json_response)
        return {"data": structured_data}

    except HTTPException:
        # Keep the status and detail chosen by the code that raised it
        # (e.g. the error raised by vision) instead of re-wrapping it.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}") from e
|
|
|
|
|
|
|
|
|