# The inference function for the fill-mask demo
from transformers import FillMaskPipeline, TFAutoModelForMaskedLM, AutoTokenizer
from transformers import BertTokenizer
# Load the tokenizer trained on the arXiv corpus
tokenizer_path_1 = "./vocab.txt"
tokenizer_1 = BertTokenizer.from_pretrained(tokenizer_path_1)
# Load the pretrained masked-language model
model_path = "./bert_lm_10"
model_1 = TFAutoModelForMaskedLM.from_pretrained(model_path)
# Build the unmasker pipeline using Hugging Face for inference
unmasker = FillMaskPipeline(model=model_1, tokenizer=tokenizer_1)
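# FillMaskPipeline fills the [MASK] token in the input and returns the
# top-scoring candidate tokens together with their softmax probabilities.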
# Try it on a sample text (expected completion: "reduction")
txt = "a polynomial [MASK] from 3-SAT."
# results = unmasker(txt, top_k=5)
# Show the results
# for res in results:
#     print(res["sequence"])
#     print(res["score"])
# Now load standard BERT pretrained on general English
default_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(default_name)
model = TFAutoModelForMaskedLM.from_pretrained(default_name)
unmasker_bert = FillMaskPipeline(model=model, tokenizer=tokenizer)
# Wrap both unmaskers in a single function for the interface
def unmask_words(txt_with_mask, k_suggestions=5):
    # Suggestions from the arXiv-pretrained model
    results_cc = unmasker(txt_with_mask, top_k=k_suggestions)
    labels = {}
    for res in results_cc:
        # Strip any whitespace from the predicted token and map it to its score
        labels["".join(res["token_str"].split(" "))] = res["score"]
    # Suggestions from English BERT, for comparison
    results_bert = unmasker_bert(txt_with_mask, top_k=k_suggestions)
    labels_bert = {}
    for res in results_bert:
        labels_bert["".join(res["token_str"].split(" "))] = res["score"]
    return labels, labels_bert
# Trying our function
# val = unmask_words(txt)
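# A minimal sanity check (a sketch, assuming the local model files above exist):
# labels_cc, labels_en = unmask_words(txt, k_suggestions=3)
# print(labels_cc)  # dict mapping token -> score for the arXiv-pretrained model
# print(labels_en)  # the same for bert-base-uncased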
import gradio as gr

description = """This is a demo of a masked language model pretrained on data collected from ~197k arXiv papers consisting of mathematical proofs and theorems.
The aim of this interface is to show the difference between pretraining on general English and on scientific English.
For more information, visit the [Theoremkb Project](https://github.com/PierreSenellart/theoremkb)
or contact [[email protected]]([email protected]).
"""
examples = [
    ["as pspace is [MASK] under complement."],
    ["n!-(n-1)[MASK]"],
    ["[MASK] these two classes is a major problem."],
    ["This would show that the polynomial hierarchy collapses at the second [MASK], which is considered unlikely."],
    ["""we consider two ways of measuring complexity, data complexity, which is with respect to the size of the data,
and their combined [MASK]"""],
]
input_box = gr.inputs.Textbox(lines=20,
                              placeholder="Unifying computational entropies via Kullback–Leibler [MASK]",
                              label="Enter the masked text:")
# Slider arguments (legacy Gradio API): minimum=1, maximum=10, step=1, default=5
interface = gr.Interface(fn=unmask_words,
                         inputs=[input_box,
                                 gr.inputs.Slider(1, 10, 1, 5, label="No. of suggestions:")],
                         outputs=[gr.outputs.Label(label="top words:"),
                                  gr.outputs.Label(label="top words eng-bert:")],
                         examples=examples,
                         theme="darkhuggingface",
                         title="CC-Bert MLM",
                         description=description,
                         allow_flagging=True)
interface.launch()