# inference script: compare the arXiv-pretrained masked LM against plain English BERT
from transformers import FillMaskPipeline, TFAutoModelForMaskedLM, AutoTokenizer, BertTokenizer

# load the custom tokenizer (BertTokenizer accepts a plain vocab file path)
tokenizer_path_1 = "./vocab.txt"
tokenizer_1 = BertTokenizer.from_pretrained(tokenizer_path_1)

# load the arXiv-pretrained masked-LM checkpoint
model_path = "./bert_lm_10"
model_1 = TFAutoModelForMaskedLM.from_pretrained(model_path)

# build the fill-mask pipeline for inference
unmasker = FillMaskPipeline(model=model_1, tokenizer=tokenizer_1)

# try it on a sample text (the expected completion is "reduction")
txt = "a polynomial [MASK] from 3-SAT."
# results = unmasker(txt, top_k=5)

# show the results
# for res in results:
#     print(res["sequence"])
#     print(res["score"])
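
# a sketch of the pipeline's output shape: each result is a dict with
# "sequence", "score", "token" and "token_str" keys (the actual values
# depend on the trained checkpoint):
# top = unmasker(txt, top_k=1)[0]
# print(top["token_str"], top["score"])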
    
# now the same pipeline for BERT pretrained on plain English
default_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(default_name)
model = TFAutoModelForMaskedLM.from_pretrained(default_name)
unmasker_bert = FillMaskPipeline(model=model, tokenizer=tokenizer)

# wrap both unmaskers in a single function for the Gradio interface
def unmask_words(txt_with_mask, k_suggestions=5):
    # query the arXiv-pretrained model
    results_cc = unmasker(txt_with_mask, top_k=k_suggestions)
    labels = {res["token_str"].replace(" ", ""): res["score"] for res in results_cc}

    # query plain English BERT on the same input
    results_bert = unmasker_bert(txt_with_mask, top_k=k_suggestions)
    labels_bert = {res["token_str"].replace(" ", ""): res["score"] for res in results_bert}

    return labels, labels_bert

# trying our function
# val = unmask_words(txt)
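
# note: each returned {token: score} dict plugs directly into a
# gr.outputs.Label component below, which renders every label together
# with its confidence score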

import gradio as gr

description = """This demo showcases a masked language model pretrained on data collected from ~197k arXiv papers, consisting of mathematical proofs and theorems.
The aim of this interface is to highlight the difference between pretraining on plain English and on scientific English.
For more information, visit the [Theoremkb Project](https://github.com/PierreSenellart/theoremkb)
or contact [[email protected]]([email protected]).
"""

examples = [["as pspace is [MASK] under complement."],
            ["n!-(n-1)[MASK]"],
            ["[MASK] these two classes is a major problem."],
            ["This would show that the polynomial hierarchy at the second [MASK], which is considered only"],
            ["""we consider two ways of measuring complexity, data complexity, which is with respect to the size of the data,
    and their combined [MASK]"""]]



input_box = gr.inputs.Textbox(lines=20,
                              placeholder="Unifying computational entropies via Kullback–Leibler [MASK]",
                              label="Enter the masked text:")
interface = gr.Interface(fn=unmask_words,
                         inputs=[input_box,
                                 gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="No. of suggestions:")],
                         outputs=[gr.outputs.Label(label="top words:"),
                                  gr.outputs.Label(label="top words eng-bert:")],
                         examples=examples,
                         theme="darkhuggingface",
                         title="CC-Bert MLM",
                         description=description,
                         allow_flagging=True)

interface.launch()
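
# optional tweak (not part of the original deployment): launch(share=True)
# would additionally expose a temporary public URL for the demo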