This is a fine-tuned proof-of-concept (PoC) of the markuplm-base model for parsing news attributes from web pages: author, publication date, title, content, etc.
Inference example
The code accepts a URL as input, loads the web page, and returns a JSON object with the extracted data (author, publication date, title, and content).
import os

import torch
from transformers import MarkupLMForTokenClassification, MarkupLMProcessor

import labels  # repository module exposing the id2label / label2id mapping below
import utils   # repository helpers for fetching, cleaning and splitting HTML

id2label = {
    0: "none", 1: "title", 2: "content", 3: "author", 4: "date",
    5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation",
}

def eval(url):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
    model_name = 'OxMarkupLM.pt'

    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False  # nodes and xpaths are supplied explicitly below

    model_path = os.path.join(model_folder, model_name)
    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=labels.id2label, label2id=labels.label2id
    )

    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for splited in example:
        nodes, xpaths = splited['nodes'], splited['xpaths']
        encoding = processor(
            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        )
        offset_mapping = encoding.pop("offset_mapping")

        with torch.no_grad():
            logits = model(**encoding).logits
        predictions = logits.argmax(-1)

        processed_words = []
        for pred_id, word_id, offset in zip(
            predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()
        ):
            # keep only the first token of each node and skip special tokens
            if word_id is not None and offset[0] == 0:
                if pred_id == 1:
                    title.append(nodes[word_id])
                elif pred_id == 2 and word_id not in processed_words:
                    processed_words.append(word_id)
                    content.append(nodes[word_id])
                elif pred_id == 3:
                    author.append(nodes[word_id])
                elif pred_id == 4:
                    date.append(nodes[word_id])

    # rank_titles is a repository helper that picks the best title candidate
    title = rank_titles(title, '\n'.join(content))
    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
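A minimal usage sketch, assuming the function above is saved in a module such as inference.py (the module name is an assumption, not part of the repository):

    import json

    from inference import eval  # assumed module name

    result = eval("https://example.com/news/some-article")  # any news article URL
    print(json.dumps(result, indent=2, ensure_ascii=False))

The returned dictionary contains the model name, the source URL, and the extracted title, author, date, and content.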
More details
More details on how this model was fine-tuned can be found in the post Building LLM crawler.
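As a rough orientation only, fine-tuning MarkupLM for node-level token classification usually follows the standard Hugging Face pattern sketched below. The dataset fields (train_examples, nodes, xpaths, node_labels) and the hyperparameters are illustrative assumptions, not the exact setup used for this model; see the linked post for the real pipeline.

    import torch
    from transformers import MarkupLMForTokenClassification, MarkupLMProcessor

    # illustrative label set matching the id2label mapping above
    label_list = ["none", "title", "content", "author", "date",
                  "header", "footer", "rail", "advertisement", "navigation"]
    id2label = {i: l for i, l in enumerate(label_list)}
    label2id = {l: i for i, l in id2label.items()}

    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False  # nodes/xpaths are already extracted

    model = MarkupLMForTokenClassification.from_pretrained(
        "microsoft/markuplm-base", id2label=id2label, label2id=label2id
    )

    # train_examples is an assumed list of dicts with pre-extracted
    # nodes, xpaths and per-node label ids
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    model.train()
    for example in train_examples:
        encoding = processor(
            nodes=example["nodes"], xpaths=example["xpaths"],
            node_labels=example["node_labels"],
            padding="max_length", truncation=True, max_length=512,
            return_tensors="pt",
        )
        loss = model(**encoding).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()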
Base model: microsoft/markuplm-base