SemanticSearch-HU / src /data /dbpedia_dump_wiki_text.py
endre sukosd
Streamlit app fixup
eb6656d
raw
history blame contribute delete
553 Bytes
from rdflib import Graph
# Input: DBpedia short-abstracts dump in Turtle format.
# Downloaded from https://databus.dbpedia.org/dbpedia/text/short-abstracts
raw_data_path = 'data/raw/short-abstracts_lang=hu.ttl'
# Output: plain-text file with one cleaned abstract per line.
preprocessed_data_path = 'data/preprocessed/shortened_abstracts_hu_2021_09_01.txt'

# Load the whole Turtle dump into an in-memory RDF graph.
g = Graph()
g.parse(raw_data_path, format='turtle')

# Write with an explicit UTF-8 encoding: the platform default codec may not be
# able to represent the text (e.g. Hungarian 'ő'/'ű' under cp1252), which
# would either raise UnicodeEncodeError or produce a non-UTF-8 corpus.
with open(preprocessed_data_path, 'w', encoding='utf-8') as f:
    # Report the number of triples in the graph as a quick sanity check.
    print(len(g))
    # Emit the object of every triple, one per output line. No predicate
    # filter is applied -- presumably the dump only contains abstract triples
    # (TODO confirm). The ' +/-' marker is removed and embedded newlines are
    # flattened to spaces so each abstract stays on a single line.
    # A generator is used so the cleaned lines are streamed to the file
    # instead of being collected in an intermediate list first.
    f.writelines(
        abstract.replace(' +/-', '').replace('\n', ' ') + '\n'
        for _subject, _predicate, abstract in g
    )