Spaces:

obss
/

question-answering-demo

Runtime error

App Files Files Community

secilozksen commited on Nov 22, 2022

Commit

38a4d89

1 Parent(s): a3f655e

Upload 8 files

Browse files

Files changed (9) hide show

.gitattributes +1 -0
README.md +17 -12
context-embeddings.pkl +3 -0
demov2.py +304 -0
policyQA.json +0 -0
policyQA_bsbs.csv +0 -0
policyQA_bsbs_sentence.csv +0 -0
policyQA_original.csv +3 -0
requirements.txt +150 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+policyQA_original.csv filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,12 +1,17 @@
----
-title: Question Answering Demo
-emoji: 📉
-colorFrom: gray
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.10.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# QuestionAnsweringDemo
+## Create the environment
+conda env create --file environment.yml
+conda activate QADemo
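+After installing the requirements, add your Hugging Face authorization token and the model names to your ./.streamlit/secrets.toml file.
+It should look something like this:
+AUTH_TOKEN='your_auth_token_here'
+MODEL_NAME='your_fine_tuned_model_name_here'
+MODEL_NAME_BASE='your_base_model_name_here'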
+## Running the app:
+streamlit run demov2.py

context-embeddings.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9051e569255d71a5dbece9ebe371c81c0ef1a2ab9af91dc23d27eddb61943310
+size 6562679

demov2.py ADDED Viewed

	@@ -0,0 +1,304 @@

+import copy
+import streamlit as st
+import json
+import pandas as pd
+import tokenizers
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+from transformers import pipeline
+from st_aggrid import GridOptionsBuilder, AgGrid
+import pickle
+import torch
+from transformers import RobertaTokenizer, RobertaForSequenceClassification
+import spacy
+import regex
+from typing import List
+from torch.autograd import Variable
+# Use the wide page layout for the three-column UI built in qa_main_widgetsv2().
+st.set_page_config(layout="wide")
+# CSV files (read with sep='|' in load_dataframes) backing the two question tables.
+DATAFRAME_FILE_ORIGINAL = 'policyQA_original.csv'
+DATAFRAME_FILE_BSBS = 'policyQA_bsbs_sentence.csv'
+@st.experimental_singleton(suppress_st_warning=True, show_spinner=False)
+def cross_encoder_init():
+    """Create the cross-encoder that search() uses to re-score retrieved passages."""
+    return CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+@st.experimental_singleton(suppress_st_warning=True, show_spinner=False)
+def bi_encoder_init():
+    """Create the bi-encoder used to embed questions for semantic search."""
+    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
+    bi_encoder.max_seq_length = 500  # Truncate long inputs to 500 tokens
+    return bi_encoder
+@st.experimental_singleton(suppress_st_warning=True, show_spinner=False)
+def nlp_init(auth_token, private_model_name):
+    """Build a question-answering pipeline from a private model on its "main" revision.
+
+    The same name is used for both the model and the tokenizer; auth_token is
+    forwarded as use_auth_token.
+    """
+    qa = pipeline(
+        'question-answering',
+        model=private_model_name,
+        tokenizer=private_model_name,
+        use_auth_token=auth_token,
+        revision="main",
+    )
+    return qa
+@st.experimental_singleton(suppress_st_warning=True, show_spinner=False)
+def nlp_pipeline_hf():
+    """Build a question-answering pipeline from the public deepset/roberta-base-squad2 checkpoint."""
+    checkpoint = "deepset/roberta-base-squad2"
+    return pipeline('question-answering', model=checkpoint, tokenizer=checkpoint)
+@st.experimental_singleton(suppress_st_warning=True, show_spinner=False)
+def nlp_pipeline_sentence_based(auth_token, private_model_name):
+    """Load the RoBERTa tokenizer and sequence-classification model for one checkpoint.
+
+    Returns a (tokenizer, model) pair; auth_token is forwarded as use_auth_token.
+    """
+    auth = {'use_auth_token': auth_token}
+    roberta_tokenizer = RobertaTokenizer.from_pretrained(private_model_name, **auth)
+    classifier = RobertaForSequenceClassification.from_pretrained(private_model_name, **auth)
+    return roberta_tokenizer, classifier
+@st.cache(hash_funcs={tokenizers.Tokenizer: lambda _: None, tokenizers.AddedToken: lambda _: None,
+                      regex.Pattern: lambda _: None}, show_spinner=False)
+def load_models_sentence_based(auth_token, private_model_name, private_model_name_base):
+    """Load the retrieval encoders and both sentence-classification QA models.
+
+    Returns a 6-tuple: (bi_encoder, cross_encoder, policy_qa_tokenizer,
+    policy_qa_model, asnq_tokenizer, asnq_model).
+    """
+    retriever = bi_encoder_init()
+    reranker = cross_encoder_init()
+    # The older span-extraction pipelines (nlp_init / nlp_pipeline_hf) are not loaded here.
+    policy_qa_pair = nlp_pipeline_sentence_based(auth_token, private_model_name)
+    asnq_pair = nlp_pipeline_sentence_based(auth_token, private_model_name_base)
+    return (retriever, reranker, *policy_qa_pair, *asnq_pair)
+@st.cache(hash_funcs={tokenizers.Tokenizer: lambda _: None, tokenizers.AddedToken: lambda _: None}, show_spinner=False)
+def load_models(auth_token, private_model_name):
+    """Load the encoders and both span-extraction QA pipelines.
+
+    Returns (bi_encoder, cross_encoder, nlp, nlp_hf).
+    """
+    return (
+        bi_encoder_init(),
+        cross_encoder_init(),
+        nlp_init(auth_token, private_model_name),
+        nlp_pipeline_hf(),
+    )
+def context(contexts_path="/home/secilsen/PycharmProjects/SquadOperations/contexes.json",
+            output_path='context-embeddings.pkl'):
+    """Encode every paragraph of a contexts JSON file and pickle the result.
+
+    Args:
+        contexts_path: JSON file whose 'contexes' key holds the paragraphs to encode.
+        output_path: destination pickle; receives a dict with the keys
+            'contexes' (the paragraphs) and 'embeddings' (their encodings).
+    """
+    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device='cpu')
+    with open(contexts_path, 'r', encoding='utf-8') as f:
+        paragraphs = json.load(f)['contexes']
+    # Encode before opening the output file, so a failure while encoding
+    # does not leave a truncated pickle behind for load_paragraphs() to read.
+    context_embeddings = bi_encoder.encode(paragraphs, convert_to_tensor=True, show_progress_bar=True)
+    with open(output_path, "wb") as f_out:
+        pickle.dump({'contexes': paragraphs, 'embeddings': context_embeddings}, f_out)
+@st.cache(show_spinner=False)
+def load_paragraphs():
+    """Read the pickled corpus and return (embeddings, paragraphs)."""
+    with open('context-embeddings.pkl', "rb") as pickled:
+        # NOTE(review): pickle can run code while loading; this file must stay a trusted artifact.
+        payload = pickle.load(pickled)
+        paragraphs = payload['contexes']
+        embeddings = payload['embeddings']
+    return embeddings, paragraphs
+@st.cache(show_spinner=False)
+def load_dataframes():
+    """Read both question CSVs, shuffle their rows and renumber the index.
+
+    Returns (data_original, data_bsbs).
+    """
+    frames = [
+        pd.read_csv(csv_path, index_col=0, sep='|')
+        for csv_path in (DATAFRAME_FILE_ORIGINAL, DATAFRAME_FILE_BSBS)
+    ]
+    # sample(frac=1) returns every row in random order.
+    shuffled_original, shuffled_bsbs = (
+        frame.sample(frac=1).reset_index(drop=True) for frame in frames
+    )
+    return shuffled_original, shuffled_bsbs
+def search(question, corpus_embeddings, contexes, bi_encoder, cross_encoder, top_k=100, num_results=20):
+    """Retrieve passages for a question with the bi-encoder, then re-rank them with the cross-encoder.
+
+    Args:
+        question: the question text.
+        corpus_embeddings: embeddings of the passages in `contexes`.
+        contexes: passages, indexed by the 'corpus_id' of each hit.
+        bi_encoder: model used to embed the question.
+        cross_encoder: model used to score (question, passage) pairs.
+        top_k: number of candidates requested from semantic search.
+        num_results: maximum number of re-ranked passages returned.
+
+    Returns:
+        (passages, scores): two parallel lists ordered by descending
+        cross-encoder score. Both are empty when nothing was retrieved.
+    """
+    # Semantic Search (Retrieve)
+    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
+    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
+    # Always return a pair so callers can unpack two values; the hit list
+    # for the single query may itself be empty.
+    if not hits or not hits[0]:
+        return [], []
+    hits = hits[0]
+    # Rerank - score all retrieved passages with cross-encoder
+    cross_inp = [[question, contexes[hit['corpus_id']]] for hit in hits]
+    cross_scores = cross_encoder.predict(cross_inp)
+    for hit, score in zip(hits, cross_scores):
+        hit['cross-score'] = score
+    # Keep the best num_results passages by cross-encoder score
+    ranked = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)[:num_results]
+    top_contexes = [contexes[hit['corpus_id']] for hit in ranked]
+    top_scores = [hit['cross-score'] for hit in ranked]
+    return top_contexes, top_scores
+def paragraph_embeddings():
+    """Re-encode the stored paragraphs with the module-level bi_encoder.
+
+    Returns (context_embeddings, paragraphs).
+    """
+    # load_paragraphs() returns (embeddings, paragraphs); only the text is
+    # encoded here, not the whole tuple.
+    _, paragraphs = load_paragraphs()
+    context_embeddings = bi_encoder.encode(paragraphs, convert_to_tensor=True, show_progress_bar=True)
+    return context_embeddings, paragraphs
+def retrieve_rerank_pipeline(question, context_embeddings, paragraphs, bi_encoder, cross_encoder):
+    """Run search() for a question and return (contexts, scores).
+
+    Both lists are empty when search() finds nothing.
+    """
+    result = search(question, context_embeddings, paragraphs, bi_encoder, cross_encoder)
+    # An empty result is normalised to a pair of empty lists so the
+    # two-value unpacking below (and in callers) cannot raise ValueError.
+    if len(result) == 0:
+        return [], []
+    top_5_contexes, top_5_scores = result
+    return top_5_contexes, top_5_scores
+def qa_pipeline(question, context, nlp):
+    """Call the QA pipeline `nlp` with the whitespace-stripped question and the context."""
+    payload = {'question': question.strip(), 'context': context}
+    return nlp(payload)
+def qa_pipeline_sentence(question, context, model, tokenizer, threshold=0.2):
+    """Pick the sentences of `context` that the classifier accepts as answers to `question`.
+
+    The context is split into sentences with the module-level spacy_nlp.
+    Each sentence is scored by `model`, and a sentence is kept when the
+    sigmoid of the logit at index 1 exceeds `threshold`.
+
+    Args:
+        question: the question text.
+        context: passage to split into candidate sentences.
+        model: sequence-classification model called on the tokenizer output.
+        tokenizer: tokenizer matching `model`.
+        threshold: minimum probability for a sentence to be kept.
+
+    Returns:
+        A list of dicts with the keys 'sentence' (the spaCy sentence span)
+        and 'prob' (a float), sorted by descending probability.
+    """
+    cutoff = torch.tensor(threshold)
+    candidate_sentences = []
+    # Inference only, so gradient tracking is switched off.
+    with torch.no_grad():
+        for sentence in spacy_nlp(context).sents:
+            # NOTE(review): <s>/</s> are written into the text by hand here;
+            # presumably the models were fine-tuned on this format - confirm.
+            tokenized = tokenizer(f"<s> {question} </s> {sentence.text} </s>", padding=True, truncation=True,
+                                  return_tensors='pt')
+            logits = model(**tokenized)[0]
+            positive_prob = torch.sigmoid(logits)[0, 1]
+            if positive_prob > cutoff:
+                candidate_sentences.append(dict(sentence=sentence, prob=positive_prob.item()))
+    return sorted(candidate_sentences, key=lambda x: x['prob'], reverse=True)
+def candidate_sentence_controller(sentences):
+    """Collapse a candidate list: "" when None or empty, the sole item when there is one, else the list itself."""
+    if sentences is None:
+        return ""
+    count = len(sentences)
+    if count == 0:
+        return ""
+    return sentences[0] if count == 1 else sentences
+def interactive_table(dataframe):
+    """Show `dataframe` in an AgGrid table and return the grid response."""
+    gb = GridOptionsBuilder.from_dataframe(dataframe)
+    gb.configure_pagination(paginationAutoPageSize=True)
+    gb.configure_side_bar()
+    gb.configure_selection('single', rowMultiSelectWithClick=True,
+                           groupSelectsChildren="Group checkbox select children")  # Selection mode is 'single'
+    gridOptions = gb.build()
+    grid_response = AgGrid(
+        dataframe,
+        gridOptions=gridOptions,
+        data_return_mode='AS_INPUT',
+        update_mode='SELECTION_CHANGED',
+        enable_enterprise_modules=False,
+        fit_columns_on_grid_load=False,
+        theme='streamlit',  # Add theme color to the table
+        height=350,
+        width='100%',
+        reload_data=False
+    )
+    return grid_response
+def _render_answers(heading, answers):
+    """Write `heading`, then the answers: nothing for None, numbered options for a list, else the value itself."""
+    st.markdown(heading)
+    if answers is None:
+        st.markdown("")
+    elif isinstance(answers, List):
+        for option_no, answer in enumerate(answers, start=1):
+            st.markdown(f"### Answer Option {option_no} with prob. {answer['prob']:.4f}")
+            st.markdown(answer['sentence'])
+    else:
+        st.markdown(answers)
+def _render_selected_row(heading, dataframe, state_key):
+    """Show `dataframe` as a selectable table and print the selected row's context, question and answer."""
+    st.markdown(heading)
+    selected_rows = interactive_table(dataframe)['selected_rows']
+    if state_key not in st.session_state:
+        st.session_state[state_key] = False
+    if len(selected_rows) > 0:
+        st.session_state[state_key] = True
+    if st.session_state[state_key]:
+        selection = selected_rows[0]
+        st.markdown("### Context:")
+        st.write(selection['context'])
+        st.markdown("### Question:")
+        st.write(selection['question'])
+        st.markdown("### Answer:")
+        st.write(selection['answer'])
+        # Reset the flag once the row has been shown.
+        st.session_state[state_key] = False
+def qa_main_widgetsv2():
+    """Build the three-column page: question form with answers, and the two example-question tables.
+
+    Reads the module-level context_embeddings, paragraphs, bi_encoder,
+    cross_encoder, policy_qa_model, policy_qa_tokenizer, asnq_model,
+    asnq_tokenizer, dataframe_original and dataframe_bsbs.
+    """
+    st.title("Question Answering Demo")
+    col1, col2, col3 = st.columns([2, 1, 1])
+    with col1:
+        form = st.form(key='first_form')
+        question = form.text_area("What is your question?:", height=200)
+        submit = form.form_submit_button('Submit')
+        if "form_submit" not in st.session_state:
+            st.session_state.form_submit = False
+        if submit:
+            st.session_state.form_submit = True
+        if st.session_state.form_submit and question != '':
+            with st.spinner(text='Related context search in progress..'):
+                top_5_contexes, top_5_scores = retrieve_rerank_pipeline(question.strip(), context_embeddings,
+                                                                        paragraphs, bi_encoder,
+                                                                        cross_encoder)
+            if len(top_5_contexes) == 0:
+                st.error("Related context not found!")
+                st.session_state.form_submit = False
+            else:
+                with st.spinner(text='Now answering your question..'):
+                    # Distinct loop names: the helpers keep their own counters, and
+                    # `passage` avoids shadowing the module-level context() function.
+                    for rank, (passage, score) in enumerate(zip(top_5_contexes, top_5_scores), start=1):
+                        answer_trained = qa_pipeline_sentence(question, passage, policy_qa_model, policy_qa_tokenizer)
+                        answer_base = qa_pipeline_sentence(question, passage, asnq_model, asnq_tokenizer)
+                        st.markdown(f"## Related Context - {rank} (score: {score:.2f})")
+                        st.markdown(passage)
+                        _render_answers("## Answer (trained):", answer_trained)
+                        _render_answers("## Answer (roberta-base-asnq):", answer_base)
+                        st.markdown("""---""")
+    with col2:
+        _render_selected_row("## Original Questions", dataframe_original, "grid_click_1")
+    with col3:
+        _render_selected_row("## Our Questions", dataframe_bsbs, "grid_click_2")
+def load():
+    """Load everything the page needs and return it as one 11-tuple.
+
+    Order: context_embeddings, paragraphs, dataframe_original, dataframe_bsbs,
+    bi_encoder, cross_encoder, policy_qa_tokenizer, policy_qa_model,
+    asnq_tokenizer, asnq_model, spacy_nlp.
+    """
+    context_embeddings, paragraphs = load_paragraphs()
+    dataframe_original, dataframe_bsbs = load_dataframes()
+    spacy_nlp = spacy.load('en_core_web_sm')
+    cached_models = load_models_sentence_based(
+        st.secrets["AUTH_TOKEN"], st.secrets["MODEL_NAME"], st.secrets["MODEL_NAME_BASE"])
+    # The app works on a deep copy of what the cached loader returned
+    # (presumably so the cached objects are never mutated - confirm).
+    models = copy.deepcopy(cached_models)
+    return (context_embeddings, paragraphs, dataframe_original, dataframe_bsbs, *models, spacy_nlp)
+#   save_dataframe()
+# context_embeddings, paragraphs, dataframe_original, dataframe_bsbs, bi_encoder, cross_encoder, nlp, nlp_hf = load()
+# Module-level globals read by qa_main_widgetsv2(), qa_pipeline_sentence() and paragraph_embeddings().
+context_embeddings, paragraphs, dataframe_original, dataframe_bsbs, bi_encoder, cross_encoder, policy_qa_tokenizer, policy_qa_model, asnq_tokenizer, asnq_model, spacy_nlp = load()
+# Build the page.
+qa_main_widgetsv2()
+# Disabled offline step: context() writes context-embeddings.pkl.
+# if __name__ == '__main__':
+#    context()

policyQA.json ADDED Viewed

The diff for this file is too large to render. See raw diff

policyQA_bsbs.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

policyQA_bsbs_sentence.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

policyQA_original.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f7b4cb4bd7c65a11f21a0553c0a419c424639a6a123cdf89ecbb05ad849b7a6
+size 28581894

requirements.txt ADDED Viewed

	@@ -0,0 +1,150 @@

+altair==4.2.0
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+attrs==21.4.0
+backcall==0.2.0
+bleach==5.0.1
+blinker==1.5
+blis==0.7.9
+brotlipy==0.7.0
+cachetools==5.2.0
+catalogue==2.0.8
+certifi==2022.9.24
+cffi==1.15.1
+charset-normalizer==2.1.1
+click==8.1.3
+commonmark==0.9.1
+cryptography==38.0.3
+cycler==0.11.0
+cymem==2.0.7
+debugpy==1.6.0
+decorator==5.1.1
+defusedxml==0.7.1
+en-core-web-sm==3.2.0
+entrypoints==0.4
+executing==0.8.3
+fastjsonschema==2.15.3
+filelock==3.8.0
+fonttools==4.33.3
+gitdb==4.0.9
+GitPython==3.1.29
+huggingface-hub==0.10.0
+idna==3.4
+importlib-metadata==5.0.0
+ipykernel==6.15.0
+ipython==8.4.0
+ipython-genutils==0.2.0
+ipywidgets==7.7.1
+jedi==0.18.1
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.6.0
+jupyter==1.0.0
+jupyter-client==7.3.4
+jupyter-console==6.4.4
+jupyter-core==4.10.0
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==1.1.1
+kiwisolver==1.4.3
+langcodes==3.3.0
+MarkupSafe==2.1.1
+matplotlib==3.5.2
+matplotlib-inline==0.1.3
+mistune==0.8.4
+mkl-fft==1.3.1
+mkl-random==1.2.2
+mkl-service==2.4.0
+mpmath==1.2.1
+murmurhash==1.0.9
+nbclient==0.6.4
+nbconvert==6.5.0
+nbformat==5.4.0
+nest-asyncio==1.5.5
+nltk==3.7
+nose==1.3.7
+notebook==6.4.12
+numpy==1.23.3
+packaging==21.3
+pandas==1.5.0
+pandocfilters==1.5.0
+parso==0.8.3
+pathy==0.6.2
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.2.0
+pip==22.2.2
+preshed==3.0.8
+prometheus-client==0.14.1
+prompt-toolkit==3.0.30
+protobuf==3.20.3
+psutil==5.9.1
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==10.0.0
+pycparser==2.21
+pydantic==1.8.2
+pydeck==0.8.0b4
+Pygments==2.12.0
+Pympler==1.0.1
+pyOpenSSL==22.1.0
+pyparsing==3.0.9
+pyrsistent==0.18.1
+PySocks==1.7.1
+python-dateutil==2.8.2
+python-decouple==3.6
+pytz==2022.6
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+pyzmq==23.2.0
+qtconsole==5.3.1
+QtPy==2.1.0
+regex==2022.10.31
+requests==2.28.1
+rich==12.6.0
+scikit-learn==1.1.2
+scipy==1.9.2
+semver==2.13.0
+Send2Trash==1.8.0
+sentence-transformers==2.2.2
+sentencepiece==0.1.97
+setuptools==65.5.0
+six==1.16.0
+smart-open==5.2.1
+smmap==5.0.0
+soupsieve==2.3.2.post1
+spacy==3.2.0
+spacy-legacy==3.0.10
+spacy-loggers==1.0.3
+srsly==2.4.5
+stack-data==0.3.0
+streamlit==1.13.0
+streamlit-aggrid==0.3.3
+sympy==1.10.1
+terminado==0.15.0
+thinc==8.0.17
+threadpoolctl==3.1.0
+tinycss2==1.1.1
+tokenizers==0.12.1
+toml==0.10.2
+toolz==0.12.0
+torch==1.12.1
+torchaudio==0.12.1
+torchvision==0.13.1
+tornado==6.1
+tqdm==4.64.1
+traitlets==5.3.0
+transformers==4.22.2
+typer==0.4.2
+typing_extensions==4.4.0
+tzdata==2022.6
+tzlocal==4.2
+urllib3==1.26.11
+validators==0.20.0
+wasabi==0.10.1
+watchdog==2.1.9
+wcwidth==0.2.5
+webencodings==0.5.1
+wheel==0.37.1
+widgetsnbextension==3.6.1
+zipp==3.10.0