"""Gradio app: revise a CV to match a job description using a fine-tuned Llama GGUF model."""

import os
import re
import subprocess  # noqa: F401  (kept from original; unused here)

import accelerate  # noqa: F401  (kept from original; unused here)
import gradio as gr
from docling.document_converter import DocumentConverter  # noqa: F401
from huggingface_hub import hf_hub_download, login
from llama_cpp import Llama
from tqdm import tqdm

# Authenticate against the Hugging Face Hub and pull the fine-tuned model once
# at startup; HF_TOKEN must be set in the environment.
login(token=os.getenv('HF_TOKEN'))

repo_id = "SyntheticIAI/CVCRaft"
model_id = "fine_tuned_llama.gguf"
hf_hub_download(repo_id=repo_id, filename=model_id, local_dir="./models")


def process_document(pdf_path):
    """Extract normalized text from every page of a PDF.

    Returns a dict mapping pdfminer page id -> normalized page text.

    NOTE(review): relies on ``extract_pages`` (pdfminer.six), which is never
    imported in this file — calling this raises NameError as-is.
    TODO: add ``from pdfminer.high_level import extract_pages``.
    """
    page2content = {}
    for extracted_page in tqdm(extract_pages(pdf_path)):
        page2content[extracted_page.pageid] = process_page(extracted_page)
    return page2content


def process_page(extracted_page):
    """Concatenate the text containers of one PDF page, top-to-bottom.

    Elements are sorted by their top edge (``y1``) descending, i.e. reading
    order on the page.

    NOTE(review): ``LTTextContainer`` (pdfminer.layout) is never imported in
    this file — calling this raises NameError as-is.
    TODO: add ``from pdfminer.layout import LTTextContainer``.
    """
    elements = sorted(extracted_page._objs, key=lambda el: el.y1, reverse=True)
    content = ''.join(
        extract_text_and_normalize(el)
        for el in elements
        if isinstance(el, LTTextContainer)
    )
    # Collapse runs of blank lines into a single newline.
    return re.sub(r'\n+', '\n', content)


def extract_text_and_normalize(element):
    """Normalize one pdfminer text element.

    Collapses internal whitespace; a line whose last character is a word
    character, comma or hyphen is treated as soft-wrapped and joined to the
    next line with a space, otherwise a newline is kept.
    """
    norm_text = ''
    for line_text in element.get_text().split('\n'):
        line_text = line_text.strip()
        if not line_text:
            line_text = '\n'
        else:
            line_text = re.sub(r'\s+', ' ', line_text)
            if not re.search(r'[\w\d\,\-]', line_text[-1]):
                line_text += '\n'
            else:
                line_text += ' '
        norm_text += line_text
    return norm_text


def txt_to_html(text):
    """Wrap each line of *text* in an HTML paragraph.

    NOTE(review): the original paragraph markup was garbled in the source;
    reconstructed here as a plain ``<p>`` wrapper — confirm against the
    intended styling.
    """
    return ''.join(
        '<p>{}</p>'.format(line.strip()) for line in text.split('\n')
    )


def craft_cv(llm, prompt, maxtokens, temperature, top_probability):
    """Run one chat completion over *prompt*.

    Returns ``('', revised_cv_text)`` — the first element is kept for
    interface compatibility with earlier versions that echoed the input CV.
    ``top_probability`` is accepted but not forwarded to the model.
    """
    # BUG FIX: llama-cpp-python's create_chat_completion expects OpenAI-style
    # {"role": ..., "content": ...} messages; the original ShareGPT-style
    # {"from": ..., "value": ...} keys raise KeyError in the chat formatter.
    output = llm.create_chat_completion(
        messages=[
            {"role": "user", "content": prompt},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
    )
    return '', output['choices'][0]['message']['content']


def convert_to_json(llm, cv_text, maxtokens, temperature, top_probability):
    """Ask the model to restructure *cv_text* into the fixed resume JSON schema.

    ``top_probability`` is accepted but not forwarded to the model.
    """
    json_format = """
    You are an expert at structuring resumes in JSON format.
    Given a modified resume text, extract the relevant details and convert them into the following structured JSON format:
    {
      "profileDetails": {
        "firstName": "",
        "lastName": "",
        "email": "",
        "contact": "",
        "country": "",
        "jobTitle": "",
        "social": "",
        "profileDesc": "",
        "address": "",
        "city": "",
        "state": "",
        "zipCode": ""
      },
      "professionalExperience": [
        {
          "positionTitle": "",
          "location": "",
          "company": "",
          "description": "",
          "startDate": "",
          "endDate": ""
        }
      ],
      "education": [
        {
          "institute": "",
          "schoolLocation": "",
          "degree": "",
          "field": "",
          "grade": "",
          "startDate": "",
          "endDate": ""
        }
      ],
      "skills": [""],
      "hobbies": [""],
      "languages": [""],
      "certifications": [""],
      "projects": [
        {
          "title": "",
          "description": ""
        }
      ],
      "jobPreferences": {
        "compTarget": "",
        "strength": "",
        "roleTarget": ""
      },
      "jobDescription": ""
    }
    Instructions:
    - Extract details accurately from the given resume.
    - Ensure proper structuring of dates, responsibilities, and projects.
    - If a field is missing in the input, leave it as an empty string or an empty list where applicable.
    - Maintain proper formatting and avoid unnecessary additions.
    Provide the response in a valid JSON format with no additional explanations.
    """
    # Same role/content fix as in craft_cv (see note there).
    output = llm.create_chat_completion(
        messages=[
            {"role": "user", "content": json_format + ' CV text: ' + cv_text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
    )
    return output['choices'][0]['message']['content']


def pdf_to_text(prompt, maxtokens=2048, temperature=0, top_probability=0.95):
    """Gradio entry point: revise the CV described by *prompt*, return it as JSON text."""
    llm = Llama(
        model_path="models/" + model_id,
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    print('MAX TOKENS IS ', maxtokens)  # typo fix: was "MAX TONENS IS"
    cv_text, crafted_cv = craft_cv(llm, prompt, maxtokens, temperature, top_probability)
    return convert_to_json(llm, crafted_cv, maxtokens, temperature, top_probability)


# UI widgets; several are defined but unused because the interface currently
# takes a single free-text input (see ``inputs=['text']`` below).
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
max_tokens = gr.Number(value=600, label="Max Tokens")
cv_file = gr.File(label='Upload the CV')
prompt_text = gr.Textbox(label='Enter the job description')
output_text = gr.Textbox()
llm_type = gr.Radio(["Fine tuned Llama3"])

iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['text'],
    outputs=['text'],
    title='Craft CV',
    description="This application assists to customize CV based on input job description",
    theme=gr.themes.Soft(),
)
iface.launch()