edithram23 committed on
Commit
fb16cd6
·
verified ·
1 Parent(s): 0b66063

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -0
app.py CHANGED
@@ -46,6 +46,38 @@ def read_pdf(file):
46
  text += page.get_text()
47
  return text, pdf_document
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def read_docx(file):
50
  doc = Document(file)
51
  text = "\n".join([para.text for para in doc.paragraphs])
 
46
  text += page.get_text()
47
  return text, pdf_document
48
 
49
def combine_words(entities):
    """Merge consecutive entity spans that touch or sit one character apart.

    Each item of ``entities`` is read through its ``'word'``, ``'start'`` and
    ``'end'`` keys (presumably token-classification output with character
    offsets -- confirm against the caller). Items are processed in the order
    given:

    * if an item starts exactly where the running span ends, its word is
      appended with no separator;
    * if it starts one position after the running span ends, its word is
      appended after a single space;
    * otherwise the running span is emitted and a new one begins.

    Every ``'##'`` occurrence is removed from each word (presumably the
    WordPiece continuation marker). A merged span is a copy of the first item
    of its run with ``'word'`` and ``'end'`` updated; any other keys keep the
    first item's values. The input dicts themselves are not modified.

    Returns:
        A list of merged entity dicts, empty when ``entities`` is empty.
    """
    merged = []
    pending = None  # span currently being extended; None until the first item

    for token in entities:
        if pending is not None:
            # Decide how (and whether) this token attaches to the open span.
            if pending['end'] == token['start']:
                joiner = ''
            elif pending['end'] + 1 == token['start']:
                joiner = ' '
            else:
                joiner = None

            if joiner is not None:
                pending['word'] += joiner + token['word'].replace('##', '')
                pending['end'] = token['end']
                continue

            # Gap too large: the open span is finished.
            merged.append(pending)

        # Open a new span from a copy so the caller's dict stays untouched.
        pending = token.copy()
        pending['word'] = pending['word'].replace('##', '')

    # Flush whatever span is still open after the loop.
    if pending is not None:
        merged.append(pending)

    return merged
79
+
80
+
81
  def read_docx(file):
82
  doc = Document(file)
83
  text = "\n".join([para.text for para in doc.paragraphs])