Spaces:
Running
Running
Get the data availability statement as context for QA
Browse files
document_qa/grobid_processors.py
CHANGED
|
@@ -183,6 +183,7 @@ class GrobidProcessor(BaseProcessor):
|
|
| 183 |
})
|
| 184 |
|
| 185 |
text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
|
|
|
|
| 186 |
|
| 187 |
use_paragraphs = True
|
| 188 |
if not use_paragraphs:
|
|
@@ -800,6 +801,20 @@ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool
|
|
| 800 |
return nodes
|
| 801 |
|
| 802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
|
| 804 |
children = []
|
| 805 |
for child in soup.TEI.children:
|
|
|
|
| 183 |
})
|
| 184 |
|
| 185 |
text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
|
| 186 |
+
text_blocks_body.extend(get_xml_nodes_back(soup, verbose=False, use_paragraphs=True))
|
| 187 |
|
| 188 |
use_paragraphs = True
|
| 189 |
if not use_paragraphs:
|
|
|
|
| 801 |
return nodes
|
| 802 |
|
| 803 |
|
| 804 |
+
def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    """Collect the text nodes located in the ``<back>`` part of a TEI document.

    Walks the direct children of ``soup.TEI``; for every child named ``text``
    it gathers the ``<p>`` (or ``<s>``) elements found under each ``<back>``
    element.

    Args:
        soup: Parsed document exposing a ``TEI`` attribute whose children offer
            ``name`` and ``find_all`` (presumably a BeautifulSoup tree --
            confirm against the caller).
        use_paragraphs: Select ``<p>`` elements when true, ``<s>`` otherwise.
        verbose: Print the collected nodes before returning them.

    Returns:
        The matching nodes, in the order ``find_all`` yields them. The list is
        empty when the document has no TEI root or no back section.
    """
    tei_root = soup.TEI
    if tei_root is None:
        # Without a TEI root there is no back matter to read; return nothing
        # instead of failing on ``None.children``.
        return []

    tag_name = "p" if use_paragraphs else "s"

    nodes = []
    for child in tei_root.children:
        # Only the <text> element can hold <back>; other children are skipped.
        if child.name != 'text':
            continue
        for back in child.find_all("back"):
            nodes.extend(back.find_all(tag_name))

    if verbose:
        print(str(nodes))

    return nodes
|
| 816 |
+
|
| 817 |
+
|
| 818 |
def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
|
| 819 |
children = []
|
| 820 |
for child in soup.TEI.children:
|