asnassar committed on
Commit
61303b8
·
verified ·
1 Parent(s): 4ff2750

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -9
app.py CHANGED
@@ -99,7 +99,6 @@ def model_inference(
99
  "pixel_values": inputs.pixel_values,
100
  "attention_mask": inputs.attention_mask,
101
  "num_return_sequences": 1,
102
- "no_repeat_ngram_size": 10,
103
  "max_new_tokens": 8192,
104
  }
105
 
@@ -111,24 +110,30 @@ def model_inference(
111
 
112
  yield "..."
113
  buffer = ""
114
- doctag_output = ""
115
 
116
  for new_text in streamer:
117
- if new_text != "<end_of_utterance>":
118
- buffer += html.escape(new_text)
119
- doctag_output += new_text
120
  yield buffer
121
 
122
- if any(tag in doctag_output for tag in ["<doctag>", "<otsl>", "<code>", "<formula>", "<chart>"]):
123
- # final_output = buffer
124
- # cleaned_output = final_output[len(inputs.input_ids):] if len(final_output) > prompt_length else final_output
 
 
 
 
125
  doc = DoclingDocument(name="Document")
126
  if "<chart>" in doctag_output:
127
  doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
128
  doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
129
-
 
 
130
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
131
  doc.load_from_doctags(doctags_doc)
 
132
  yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
133
 
134
  examples=[[{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],
 
99
  "pixel_values": inputs.pixel_values,
100
  "attention_mask": inputs.attention_mask,
101
  "num_return_sequences": 1,
 
102
  "max_new_tokens": 8192,
103
  }
104
 
 
110
 
111
  yield "..."
112
  buffer = ""
113
+ full_output = ""
114
 
115
  for new_text in streamer:
116
+ full_output += new_text
117
+ buffer += html.escape(new_text)
 
118
  yield buffer
119
 
120
+ cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
121
+
122
+ if cleaned_output:
123
+ doctag_output = cleaned_output
124
+ yield cleaned_output
125
+
126
+ if any(tag in doctag_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
127
  doc = DoclingDocument(name="Document")
128
  if "<chart>" in doctag_output:
129
  doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
130
  doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
131
+
132
+ print(doctag_output)
133
+
134
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
135
  doc.load_from_doctags(doctags_doc)
136
+ print(doc)
137
  yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
138
 
139
  examples=[[{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],