srijaydeshpande committed on
Commit
0c4fe37
·
verified ·
1 Parent(s): 8b2ea04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -157
app.py CHANGED
@@ -1,5 +1,3 @@
1
- from pdfminer.high_level import extract_pages
2
- from pdfminer.layout import LTTextContainer
3
  import re
4
  import gradio as gr
5
  import os
@@ -10,15 +8,14 @@ import subprocess
10
  from huggingface_hub import hf_hub_download
11
  from llama_cpp import Llama
12
  from huggingface_hub import login
13
- # from docling.document_converter import DocumentConverter
14
 
15
  login(token = os.getenv('HF_TOKEN'))
16
 
17
- # repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"
18
- # model_id = "Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf"
19
 
20
- repo_id = "QuantFactory/Meta-Llama-3-70B-Instruct-GGUF"
21
- model_id = "Meta-Llama-3-70B-Instruct.Q2_K.gguf"
22
 
23
  local_dir = "models"
24
 
@@ -28,175 +25,42 @@ hf_hub_download(
28
  local_dir = local_dir
29
  )
30
 
31
- def process_document(pdf_path):
32
- extracted_pages = extract_pages(pdf_path)
33
- page2content = {}
34
- for extracted_page in tqdm(extracted_pages):
35
- page_id = extracted_page.pageid
36
- content = process_page(extracted_page)
37
- page2content[page_id] = content
38
- return page2content
39
-
40
-
41
- def process_page(extracted_page):
42
- content = []
43
- elements = [element for element in extracted_page._objs]
44
- elements.sort(key=lambda a: a.y1, reverse=True)
45
- for i, element in enumerate(elements):
46
- if isinstance(element, LTTextContainer):
47
- line_text = extract_text_and_normalize(element)
48
- content.append(line_text)
49
- content = re.sub('\n+', '\n', ''.join(content))
50
- return content
51
-
52
-
53
- def extract_text_and_normalize(element):
54
- # Extract text from line and split it with new lines
55
- line_texts = element.get_text().split('\n')
56
- norm_text = ''
57
- for line_text in line_texts:
58
- line_text = line_text.strip()
59
- if not line_text:
60
- line_text = '\n'
61
- else:
62
- line_text = re.sub('\s+', ' ', line_text)
63
- if not re.search('[\w\d\,\-]', line_text[-1]):
64
- line_text += '\n'
65
- else:
66
- line_text += ' '
67
- norm_text += line_text
68
- return norm_text
69
-
70
-
71
- def txt_to_html(text):
72
- html_content = "<html><body>"
73
- for line in text.split('\n'):
74
- html_content += "<p>{}</p>".format(line.strip())
75
- html_content += "</body></html>"
76
- return html_content
77
-
78
- def harmonize_doc(llm, pdftext, prompt, maxtokens, temperature, top_probability, model_name):
79
- print('PDFText is ',pdftext)
80
-
81
- # prompt = '''
82
- # Standardize the following colonoscopy report into the structured format. Extract all information as-is from the PDFs, making no changes to content. For fields that aren’t available in a report, use 'N/A'.
83
- # Structure as follows: 1. Patient Information: Name, date of birth, gender, address, and any ID numbers.
84
- # 2. Procedure Details: Date, hospital, referring doctor, endoscopist, priority, and premedication.
85
- # 3. Findings from report
86
- # 4. Procedure Summary
87
- # 5. Diagnosis and Follow-Up/Advice
88
- # '''
89
-
90
- # prompt = "Please harmonize the following medical endoscopy report into a consistent format. The report should include the following standardized sections: Hospital Name, Patient Information (Name, NHS Number, Hospital Number, Date of Birth, Address), Date of Procedure, Referring Consultant, Endoscopist, Instrument Used, Medication, Patient Sedation, Indications for Procedure, Co-morbidities, Extent of Exam, Findings (site-by-site), Biopsy Details, Diagnosis, Management Plan, Follow-Up, and Additional Comments. If any information is not provided in the report, write 'N/A' for that field. Ensure that both reports follow this structure for clarity and consistency."
91
-
92
- prompt = """
93
- Please reformat the provided medical report into the following standardized structure:
94
-
95
- 1. Hospital Information:
96
- - Name of Hospital: [Name of hospital]
97
- - Department: [Relevant department or 'N/A']
98
-
99
- 2. Patient Information:
100
- - Name: [Full Name]
101
- - Gender: [Gender]
102
- - Date of Birth: [Date of Birth]
103
- - Address: [Full Address or 'N/A']
104
- - ID Numbers:
105
- - [Relevant identifiers such as NHS Number, Case Number, etc.]
106
-
107
- 3. Procedure Details:
108
- - Date of Procedure: [Date]
109
- - Referring Doctor: [Name or 'N/A']
110
- - Performed By:
111
- - Consultant: [Name or 'N/A']
112
- - Additional Clinicians: [Name(s) or 'N/A']
113
- - Nurses: [Name(s) or 'N/A']
114
- - Details:
115
- - Indications: [Symptoms, reasons for procedure]
116
- - Instrument: [Instrument details or 'N/A']
117
- - Co-morbidities: [Relevant conditions or 'N/A']
118
- - ASA Status: [ASA classification or 'N/A']
119
- - Procedure: [Details of patient preparation and exact description of procedures performed as in the original report or 'N/A']
120
- - Findings: [Exact findings from the report, including any locations, measurements, or observations]
121
- - Specimens Taken: [Details on specimens, if any, or 'N/A']
122
- - Comments: [Additional notes, advice, or remarks from the report]
123
-
124
- 4. Diagnosis and Outcomes:
125
- - Diagnosis: [Exact diagnosis or 'N/A']
126
- - Therapeutic Actions: [Treatments performed or 'N/A']
127
- - Complications: [Details on complications or 'No complications']
128
- - Follow-Up: [Exact follow-up recommendations from the report]
129
-
130
- Instructions for Output:
131
- 1. Use the exact wording and details from the original report wherever possible. Do not summarize or interpret information.
132
- 2. If any information is missing in the original report, use 'N/A' for the corresponding field.
133
- 3. Ensure the output matches the above structure exactly pointwise, without omitting any fields.
134
- 4. Retain all medical terms, values, and phrases as stated in the report.
135
- """
136
-
137
-
138
-
139
  output = llm.create_chat_completion(
140
  messages=[
141
  {"role": "assistant", "content": prompt},
142
  {
143
  "role": "user",
144
- "content": pdftext
145
  }
146
  ],
147
  max_tokens=maxtokens,
148
  temperature=temperature
149
  )
150
-
151
  output = output['choices'][0]['message']['content']
152
  find_index = output.find(' '.join(pdftext.split()[:3]))
153
  if find_index != -1:
154
  output = output[find_index:].strip()
155
  return output
156
 
157
-
158
- @spaces.GPU(duration=120)
159
- def pdf_to_text(files, input_text='', prompt='', model_name='default', temperature=0, maxtokens=2048, top_probability=0.95):
160
- llm = Llama(
161
- model_path="models/" + model_id,
162
- flash_attn=True,
163
- n_gpu_layers=81,
164
- n_batch=1024,
165
- n_ctx=8192,
166
- )
167
- # llm = Llama.from_pretrained(
168
- # repo_id=local_dir,
169
- # filename=model_id,
170
- # )
171
- harmonized_text = ''
172
- for file in files:
173
- page2content = process_document(file)
174
- pdftext = ''
175
- for page_id in page2content:
176
- pdftext += page2content[page_id]
177
- # converter = DocumentConverter()
178
- # result = converter.convert(file)
179
- # pdftext = result.document.export_to_markdown()
180
- input_text = pdftext
181
- harmonized_text += harmonize_doc(llm, input_text, prompt, maxtokens, temperature, top_probability, model_name)
182
- harmonized_text += '\n\n-----------------------------------------------------------------\n\n'
183
- print('Harmonized text is ',harmonized_text)
184
- return harmonized_text, input_text
185
-
186
 
187
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
188
- model_name = gr.Dropdown(["default", "fine-tuned"], label="LLama Model")
189
- max_tokens = gr.Number(value=600, label="Max Tokens")
190
- input_text = gr.Text(label='Input Text')
191
- input_prompt = gr.Text(label='Prompt')
192
- input_files = gr.File(file_count="multiple")
193
- output_path_component = gr.File(label="Select Output Path")
194
  iface = gr.Interface(
195
- fn=pdf_to_text,
196
- inputs=input_files,
197
- outputs=['text', 'text'],
198
- title='COBIx Endoscopy Report Harmonization',
199
- description="This application helps standardize medical reports into a consistent format",
200
  theme=gr.themes.Soft(),
201
  )
202
  iface.launch()
 
 
 
1
  import re
2
  import gradio as gr
3
  import os
 
8
  from huggingface_hub import hf_hub_download
9
  from llama_cpp import Llama
10
  from huggingface_hub import login
 
11
 
12
  login(token = os.getenv('HF_TOKEN'))
13
 
14
+ repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"
15
+ model_id = "Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf"
16
 
17
+ # repo_id = "QuantFactory/Meta-Llama-3-70B-Instruct-GGUF"
18
+ # model_id = "Meta-Llama-3-70B-Instruct.Q2_K.gguf"
19
 
20
  local_dir = "models"
21
 
 
25
  local_dir = local_dir
26
  )
27
 
28
@spaces.GPU(duration=120)
def get_itinerary(llm, information=None, maxtokens=1000, temperature=0.9, top_probability=0.95):
    """Generate a travel itinerary from free-text place information using a local Llama model.

    Parameters:
        llm: Ignored/rebuilt internally. Gradio's ``inputs='text'`` wiring passes the
            user's text as the FIRST positional argument, so when ``information`` is
            not given, this value is treated as the information text instead.
        information: The place description / trip details to build an itinerary from.
        maxtokens: Token budget for the completion.
        temperature: Sampling temperature.
        top_probability: Accepted for interface compatibility; currently unused
            (create_chat_completion below is not given a top_p argument).

    Returns:
        The model's itinerary text, with any echoed copy of the input prompt stripped.
    """
    if information is None:
        # Single-argument call (e.g. from gr.Interface with inputs='text'):
        # the user text arrived in `llm`, which we are about to rebuild anyway.
        information = llm
    # NOTE(review): the model is reloaded on every request — expensive; presumably
    # acceptable under the @spaces.GPU lifecycle, but consider caching the Llama
    # instance if requests are frequent.
    llm = Llama(
        model_path="models/" + model_id,
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    prompt = "Please prepare a nice and fancy itinerary for the place and information provided following: "
    output = llm.create_chat_completion(
        messages=[
            {"role": "assistant", "content": prompt},
            {
                "role": "user",
                "content": information
            }
        ],
        max_tokens=maxtokens,
        temperature=temperature
    )
    output = output['choices'][0]['message']['content']
    # If the model echoed the input, drop everything before the first three
    # words of the user's text. (Was `pdftext` — an undefined leftover name
    # from the previous PDF-harmonization version of this app: NameError.)
    find_index = output.find(' '.join(information.split()[:3]))
    if find_index != -1:
        output = output[find_index:].strip()
    return output
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): temp_slider and max_tokens are constructed but never passed to
# gr.Interface, so they have no effect on the running app — presumably
# leftovers from an earlier version; confirm before removing.
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
max_tokens = gr.Number(value=1000, label="Max Tokens")

# A single free-text box in, a single text box out.
iface = gr.Interface(
    fn=get_itinerary,
    inputs='text',
    outputs='text',
    title='VoyageX',
    description="This application helps building itinerary",
    theme=gr.themes.Soft(),
)
iface.launch()