srijaydeshpande committed on
Commit
0c4fe37
·
verified ·
1 Parent(s): 8b2ea04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -157
app.py CHANGED
@@ -1,5 +1,3 @@
1
- from pdfminer.high_level import extract_pages
2
- from pdfminer.layout import LTTextContainer
3
  import re
4
  import gradio as gr
5
  import os
@@ -10,15 +8,14 @@ import subprocess
10
  from huggingface_hub import hf_hub_download
11
  from llama_cpp import Llama
12
  from huggingface_hub import login
13
- # from docling.document_converter import DocumentConverter
14
 
15
  login(token = os.getenv('HF_TOKEN'))
16
 
17
- # repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"
18
- # model_id = "Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf"
19
 
20
- repo_id = "QuantFactory/Meta-Llama-3-70B-Instruct-GGUF"
21
- model_id = "Meta-Llama-3-70B-Instruct.Q2_K.gguf"
22
 
23
  local_dir = "models"
24
 
@@ -28,175 +25,42 @@ hf_hub_download(
28
  local_dir = local_dir
29
  )
30
 
31
- def process_document(pdf_path):
32
- extracted_pages = extract_pages(pdf_path)
33
- page2content = {}
34
- for extracted_page in tqdm(extracted_pages):
35
- page_id = extracted_page.pageid
36
- content = process_page(extracted_page)
37
- page2content[page_id] = content
38
- return page2content
39
-
40
-
41
- def process_page(extracted_page):
42
- content = []
43
- elements = [element for element in extracted_page._objs]
44
- elements.sort(key=lambda a: a.y1, reverse=True)
45
- for i, element in enumerate(elements):
46
- if isinstance(element, LTTextContainer):
47
- line_text = extract_text_and_normalize(element)
48
- content.append(line_text)
49
- content = re.sub('\n+', '\n', ''.join(content))
50
- return content
51
-
52
-
53
- def extract_text_and_normalize(element):
54
- # Extract text from line and split it with new lines
55
- line_texts = element.get_text().split('\n')
56
- norm_text = ''
57
- for line_text in line_texts:
58
- line_text = line_text.strip()
59
- if not line_text:
60
- line_text = '\n'
61
- else:
62
- line_text = re.sub('\s+', ' ', line_text)
63
- if not re.search('[\w\d\,\-]', line_text[-1]):
64
- line_text += '\n'
65
- else:
66
- line_text += ' '
67
- norm_text += line_text
68
- return norm_text
69
-
70
-
71
- def txt_to_html(text):
72
- html_content = "<html><body>"
73
- for line in text.split('\n'):
74
- html_content += "<p>{}</p>".format(line.strip())
75
- html_content += "</body></html>"
76
- return html_content
77
-
78
- def harmonize_doc(llm, pdftext, prompt, maxtokens, temperature, top_probability, model_name):
79
- print('PDFText is ',pdftext)
80
-
81
- # prompt = '''
82
- # Standardize the following colonoscopy report into the structured format. Extract all information as-is from the PDFs, making no changes to content. For fields that aren’t available in a report, use 'N/A'.
83
- # Structure as follows: 1. Patient Information: Name, date of birth, gender, address, and any ID numbers.
84
- # 2. Procedure Details: Date, hospital, referring doctor, endoscopist, priority, and premedication.
85
- # 3. Findings from report
86
- # 4. Procedure Summary
87
- # 5. Diagnosis and Follow-Up/Advice
88
- # '''
89
-
90
- # prompt = "Please harmonize the following medical endoscopy report into a consistent format. The report should include the following standardized sections: Hospital Name, Patient Information (Name, NHS Number, Hospital Number, Date of Birth, Address), Date of Procedure, Referring Consultant, Endoscopist, Instrument Used, Medication, Patient Sedation, Indications for Procedure, Co-morbidities, Extent of Exam, Findings (site-by-site), Biopsy Details, Diagnosis, Management Plan, Follow-Up, and Additional Comments. If any information is not provided in the report, write 'N/A' for that field. Ensure that both reports follow this structure for clarity and consistency."
91
-
92
- prompt = """
93
- Please reformat the provided medical report into the following standardized structure:
94
-
95
- 1. Hospital Information:
96
- - Name of Hospital: [Name of hospital]
97
- - Department: [Relevant department or 'N/A']
98
-
99
- 2. Patient Information:
100
- - Name: [Full Name]
101
- - Gender: [Gender]
102
- - Date of Birth: [Date of Birth]
103
- - Address: [Full Address or 'N/A']
104
- - ID Numbers:
105
- - [Relevant identifiers such as NHS Number, Case Number, etc.]
106
-
107
- 3. Procedure Details:
108
- - Date of Procedure: [Date]
109
- - Referring Doctor: [Name or 'N/A']
110
- - Performed By:
111
- - Consultant: [Name or 'N/A']
112
- - Additional Clinicians: [Name(s) or 'N/A']
113
- - Nurses: [Name(s) or 'N/A']
114
- - Details:
115
- - Indications: [Symptoms, reasons for procedure]
116
- - Instrument: [Instrument details or 'N/A']
117
- - Co-morbidities: [Relevant conditions or 'N/A']
118
- - ASA Status: [ASA classification or 'N/A']
119
- - Procedure: [Details of patient preparation and exact description of procedures performed as in the original report or 'N/A']
120
- - Findings: [Exact findings from the report, including any locations, measurements, or observations]
121
- - Specimens Taken: [Details on specimens, if any, or 'N/A']
122
- - Comments: [Additional notes, advice, or remarks from the report]
123
-
124
- 4. Diagnosis and Outcomes:
125
- - Diagnosis: [Exact diagnosis or 'N/A']
126
- - Therapeutic Actions: [Treatments performed or 'N/A']
127
- - Complications: [Details on complications or 'No complications']
128
- - Follow-Up: [Exact follow-up recommendations from the report]
129
-
130
- Instructions for Output:
131
- 1. Use the exact wording and details from the original report wherever possible. Do not summarize or interpret information.
132
- 2. If any information is missing in the original report, use 'N/A' for the corresponding field.
133
- 3. Ensure the output matches the above structure exactly pointwise, without omitting any fields.
134
- 4. Retain all medical terms, values, and phrases as stated in the report.
135
- """
136
-
137
-
138
-
139
  output = llm.create_chat_completion(
140
  messages=[
141
  {"role": "assistant", "content": prompt},
142
  {
143
  "role": "user",
144
- "content": pdftext
145
  }
146
  ],
147
  max_tokens=maxtokens,
148
  temperature=temperature
149
  )
150
-
151
  output = output['choices'][0]['message']['content']
152
  find_index = output.find(' '.join(pdftext.split()[:3]))
153
  if find_index != -1:
154
  output = output[find_index:].strip()
155
  return output
156
 
157
-
158
- @spaces.GPU(duration=120)
159
- def pdf_to_text(files, input_text='', prompt='', model_name='default', temperature=0, maxtokens=2048, top_probability=0.95):
160
- llm = Llama(
161
- model_path="models/" + model_id,
162
- flash_attn=True,
163
- n_gpu_layers=81,
164
- n_batch=1024,
165
- n_ctx=8192,
166
- )
167
- # llm = Llama.from_pretrained(
168
- # repo_id=local_dir,
169
- # filename=model_id,
170
- # )
171
- harmonized_text = ''
172
- for file in files:
173
- page2content = process_document(file)
174
- pdftext = ''
175
- for page_id in page2content:
176
- pdftext += page2content[page_id]
177
- # converter = DocumentConverter()
178
- # result = converter.convert(file)
179
- # pdftext = result.document.export_to_markdown()
180
- input_text = pdftext
181
- harmonized_text += harmonize_doc(llm, input_text, prompt, maxtokens, temperature, top_probability, model_name)
182
- harmonized_text += '\n\n-----------------------------------------------------------------\n\n'
183
- print('Harmonized text is ',harmonized_text)
184
- return harmonized_text, input_text
185
-
186
 
187
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
188
- model_name = gr.Dropdown(["default", "fine-tuned"], label="LLama Model")
189
- max_tokens = gr.Number(value=600, label="Max Tokens")
190
- input_text = gr.Text(label='Input Text')
191
- input_prompt = gr.Text(label='Prompt')
192
- input_files = gr.File(file_count="multiple")
193
- output_path_component = gr.File(label="Select Output Path")
194
  iface = gr.Interface(
195
- fn=pdf_to_text,
196
- inputs=input_files,
197
- outputs=['text', 'text'],
198
- title='COBIx Endoscopy Report Harmonization',
199
- description="This application helps standardize medical reports into a consistent format",
200
  theme=gr.themes.Soft(),
201
  )
202
  iface.launch()
 
 
 
1
  import re
2
  import gradio as gr
3
  import os
 
8
  from huggingface_hub import hf_hub_download
9
  from llama_cpp import Llama
10
  from huggingface_hub import login
 
11
 
12
  login(token = os.getenv('HF_TOKEN'))
13
 
14
+ repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"
15
+ model_id = "Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf"
16
 
17
+ # repo_id = "QuantFactory/Meta-Llama-3-70B-Instruct-GGUF"
18
+ # model_id = "Meta-Llama-3-70B-Instruct.Q2_K.gguf"
19
 
20
  local_dir = "models"
21
 
 
25
  local_dir = local_dir
26
  )
27
 
28
@spaces.GPU(duration=120)
def get_itinerary(llm, information=None, maxtokens=1000, temperature=0.9, top_probability=0.95):
    """Generate a travel itinerary from free-text place information using a local Llama model.

    Parameters:
        llm: Ignored/rebuilt internally. Gradio's ``inputs='text'`` wiring passes the
            user's text as the FIRST positional argument, so when ``information`` is
            not given, this value is treated as the information text instead.
        information: The place description / trip details to build an itinerary from.
        maxtokens: Token budget for the completion.
        temperature: Sampling temperature.
        top_probability: Accepted for interface compatibility; currently unused
            (create_chat_completion below is not given a top_p argument).

    Returns:
        The model's itinerary text, with any echoed copy of the input prompt stripped.
    """
    if information is None:
        # Single-argument call (e.g. from gr.Interface with inputs='text'):
        # the user text arrived in `llm`, which we are about to rebuild anyway.
        information = llm
    # NOTE(review): the model is reloaded on every request — expensive; presumably
    # acceptable under the @spaces.GPU lifecycle, but consider caching the Llama
    # instance if requests are frequent.
    llm = Llama(
        model_path="models/" + model_id,
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    prompt = "Please prepare a nice and fancy itinerary for the place and information provided following: "
    output = llm.create_chat_completion(
        messages=[
            {"role": "assistant", "content": prompt},
            {
                "role": "user",
                "content": information
            }
        ],
        max_tokens=maxtokens,
        temperature=temperature
    )
    output = output['choices'][0]['message']['content']
    # If the model echoed the input, drop everything before the first three
    # words of the user's text. (Was `pdftext` — an undefined leftover name
    # from the previous PDF-harmonization version of this app: NameError.)
    find_index = output.find(' '.join(information.split()[:3]))
    if find_index != -1:
        output = output[find_index:].strip()
    return output
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): temp_slider and max_tokens are constructed but never passed to
# gr.Interface, so they have no effect on the running app — presumably
# leftovers from an earlier version; confirm before removing.
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
max_tokens = gr.Number(value=1000, label="Max Tokens")

# A single free-text box in, a single text box out.
iface = gr.Interface(
    fn=get_itinerary,
    inputs='text',
    outputs='text',
    title='VoyageX',
    description="This application helps building itinerary",
    theme=gr.themes.Soft(),
)
iface.launch()