Kaelan
committed on
Commit a197a13
1 Parent(s): cd3ed14
Add application file
Browse files
- app.py +209 -0
- dockerfile +24 -0
- eval_35.csv +34 -0
- requirements.txt +32 -0
- src/__pycache__/app_utils.cpython-39.pyc +0 -0
- src/__pycache__/inference.cpython-39.pyc +0 -0
- src/__pycache__/model_utils.cpython-39.pyc +0 -0
- src/__pycache__/negation.cpython-39.pyc +0 -0
- src/__pycache__/trainers.cpython-39.pyc +0 -0
- src/app_utils.py +175 -0
- src/inference.py +33 -0
- src/model_utils.py +111 -0
- src/negation.py +78 -0
- src/trainers.py +168 -0
- st_config.yaml +23 -0
app.py
ADDED
@@ -0,0 +1,209 @@
import streamlit as st
from annotated_text import annotated_text
import pandas as pd
import yaml
import os
import ast

from src.negation import *
from src.app_utils import *
from src.inference import inference
from src.trainers import eval_spacy

#### Loading configuration and models ####

with open('./st_config.yaml', "r") as yamlfile:
    args = yaml.load(yamlfile, Loader=yaml.FullLoader)

if args['model_dir'] is None:
    model_names_dir = []
elif os.path.exists(args['model_dir']):
    model_names_dir = os.listdir(args['model_dir'])
else:
    model_names_dir = []

model_names = model_names_dir + args['default_models'] if args['default_models'] is not None else model_names_dir

st.title('Radiology NER')
st.markdown('This app experiments with using NER to extract spans of text from radiological notes that indicate the current condition of the patient. The targeted extractions include \n 1) the symptoms of the disease \n 2) the location of the organs affected by the symptoms \n 3) and the progress of the disease. \nen_ner_bc5cdr_md is the base model, already trained to detect diseases and chemicals. en_Radiology_ner_bc5cdr_md is fine-tuned from the base model with additional entities that indicate "Existence or Worsening" and "Absence or Recovering" of symptoms. This helps practitioners quickly spot the key words in a clinical report, and the tabulated result can be used for other downstream analysis.')


##################################
#### Sidebar (Choose Model) ######
##################################
model_name = st.sidebar.selectbox("Select a model", options=model_names)
if len(model_names) > 0:
    models = load_models(model_names, args, model_names_dir)
    selected_model = models[model_name]

##################################
#### Sidebar (Choose Example) ####
##################################
st.sidebar.markdown('###')
if args['examples'] is not None:
    chosen_note = st.sidebar.selectbox("Select an example text", options=args['examples'].keys())
else:
    chosen_note = None

if chosen_note == "radiology_eval_dataset":
    # literal_eval converts the stringified entity lists back into Python lists
    text_input = pd.read_csv("./eval_35.csv", converters={'entities': ast.literal_eval})
    text_input = text_input.to_dict('records')

# set colors for each entity
if len(model_names) > 0:
    ents_available = selected_model.get_pipe('ner').labels
    ent_colors_map = dict(zip(ents_available, args['colors_palette'][:len(ents_available)]))

##################
### Text area ###
##################
if chosen_note != "radiology_eval_dataset":
    text_input = st.text_area("Type notes in the box below",
                              value=args['examples'][chosen_note] if args['examples'] is not None else '')
st.markdown("---")

############################
### Sidebar (Load Files) ###
############################
st.sidebar.info('For csv & json files, name the text column to be inferred "text" and the annotated-labels column "entities". The expected json format is shown below.')
st.sidebar.json([{"text": "example", "entities": [[5, 6, "do"], [8, 11, "dx"]]},
                 {"text": "example2", "entities": [[5, 6, "do"], [8, 11, "dx"]]}], expanded=False)
uploaded_file = st.sidebar.file_uploader("Upload a file", type=["csv", "json", "pdf", "txt"])
text_input = process_files(uploaded_file, text_input)

#################################
### Sidebar (Select Entities) ###
#################################
selected_entities = st.sidebar.multiselect(
    "Select the entities you want to view",
    options=ents_available if len(model_names) > 0 else [],
    default=ents_available if len(model_names) > 0 else [],
)

##########################
### Text Area (Slider) ###
##########################
if len(text_input) > 1 and isinstance(text_input, (list, dict)):
    sample = st.slider('Select Example', min_value=1, max_value=len(text_input))
else:
    sample = None

# Process documents to tokens
if len(model_names) > 0:
    infer_input = text_input[sample - 1]["text"] if sample is not None else text_input
    doc = selected_model(infer_input)

textcol_negate, textcol_compare = st.columns([1, 1])

# checkbox for negation
negate = textcol_negate.checkbox('Check for Negation')

##########################################
### Checkboxes for Compare with labels ###
##########################################
if isinstance(text_input, (dict, list)):
    if 'entities' in text_input[0].keys():
        state_compare = False
        compare = textcol_compare.checkbox('Compare between predictions and labels', disabled=state_compare)
    else:
        state_compare, compare = True, False
else:
    state_compare, compare = True, False

###############################
### Processing for negation ###
###############################
if negate:
    neg_ent = {"ent_types": list(selected_model.get_pipe('ner').labels)}
    neg = negation(selected_model, neg_ent)
    doc = infer_negation(neg, selected_model, infer_input, doc)
    selected_entities += ['NEG']
    ent_colors_map.update({'NEG': '#C7C7C7'})

#################################
### Processing for comparison ###
#################################
if compare and isinstance(text_input, (dict, list)):
    infer_input = text_input[sample - 1]
    tokens_compare = process_text_compare(infer_input, selected_entities, colors=ent_colors_map)

tokens = process_text(doc, selected_entities, colors=ent_colors_map)

st.markdown('##')
# Display results
st.markdown('#### Predictions')
annotated_text(*tokens)

if compare and isinstance(text_input, (dict, list)):
    st.markdown('#### Labels')
    annotated_text(*tokens_compare)

st.markdown("---")
data = pd.DataFrame.from_dict([{'label': entity.label_, 'text': entity.text, 'start': entity.start, 'end': entity.end}
                               for entity in doc.ents])
if data.shape[1] > 0:
    st.table(data['label'].value_counts())
    myexpander = st.expander('Details on text')
    myexpander.table(data)

###################################
#### Inference on whole dataset ###
###################################
infer_whole_dataset = st.checkbox('Inference on whole dataset')
if isinstance(text_input, (dict, list)) and infer_whole_dataset:
    texts = [record['text'] for record in text_input]

    st.markdown('### Prediction on whole dataset')
    inference_data = inference(selected_model, texts)

    ### Applying negation to the whole dataset
    if negate:
        neg_ent = {"ent_types": list(selected_model.get_pipe('ner').labels)}
        neg = negation(selected_model, neg_ent)
        docs = selected_model.pipe(texts, batch_size=8)

        records = []
        for no, doc in enumerate(docs):
            doc = infer_negation(neg, selected_model, texts[no], doc)
            if len(doc.ents) > 0:
                records.append([{'id': no + 1, 'text': doc.text, 'span': entity.text,
                                 'entity': entity.label_, 'start': entity.start, 'end': entity.end}
                                for entity in doc.ents])
            else:
                records.append([{'id': no + 1, 'text': doc.text, 'span': None,
                                 'entity': None, 'start': None, 'end': None}])

        inference_data = pd.DataFrame.from_dict(sum(records, [])).set_index(['text', 'id'])

    st.download_button(
        label="Download Prediction as CSV",
        data=inference_data.to_csv().encode('utf-8'),
        file_name='inference_data.csv',
        mime='text/csv',
    )

    #########################################
    ### Expander for dataframe and report ###
    #########################################
    report_expander = st.expander('Report on Evaluation Results')
    results_metrics = eval_spacy(selected_model, text_input)
    overall_score = pd.DataFrame.from_dict({'Type': ['Overall'],
                                            'Precision': [results_metrics['ents_p']],
                                            'Recall': [results_metrics['ents_r']],
                                            'F1': [results_metrics['ents_f']]})
    overall_score = overall_score.set_index('Type')
    entities_score = pd.DataFrame.from_dict(results_metrics['ents_per_type']).T
    entities_score = entities_score.rename(columns={'p': 'Precision', 'r': 'Recall', 'f': 'F1'})
    report_expander.table(overall_score)
    report_expander.table(entities_score)

    df_expander = st.expander('Inference Table')
    df_expander.write(inference_data.to_html(), unsafe_allow_html=True)
    #df_expander.table(inference_data)
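Outside of Streamlit, the core flow of the app — load a model, run it over a note, and tabulate the detected entities — can be reproduced directly. A minimal sketch, assuming the pinned en_ner_bc5cdr_md wheel from requirements.txt is installed (the example sentence is illustrative only):

import spacy
import pandas as pd

# load one of the models the app lists in st_config.yaml
nlp = spacy.load("en_ner_bc5cdr_md")
doc = nlp("Patient presented with fever and dry cough for 3 days.")

# same tabulation app.py builds for the 'Details on text' expander
data = pd.DataFrame.from_dict([{'label': e.label_, 'text': e.text, 'start': e.start, 'end': e.end}
                               for e in doc.ents])
print(data)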
dockerfile
ADDED
@@ -0,0 +1,24 @@
FROM python:3.9.16-slim-buster

WORKDIR /app

RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    && pip install --upgrade pip \
    && apt clean && rm -rf /var/lib/apt/lists/*

COPY . /app

RUN pip3 install -r requirements.txt

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

RUN mkdir ./models && chmod 777 ./models

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
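Assuming Docker is available, the image builds and serves the app with the standard docker build and docker run -p 8501:8501 commands; the HEALTHCHECK above polls Streamlit's /_stcore/health endpoint on the exposed port, and the world-writable ./models directory appears intended as a writable location for models at runtime.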
eval_35.csv
ADDED
@@ -0,0 +1,34 @@
text,entities
A 30 year old female patient with a past medical history of asthma morbid obesity BMI 39 5 and hypertension on an angiotensin converting enzyme ACE inhibitor presented with a 6 day history of fever Tmax 38 9°C cough and shortness of breath Laboratory studies were remarkable for lymphopenia 0 6×103 µL normal range 0 9×103 µL – 3 3×103 µL elevated serum creatinine 1 3 mg dL normal range 0 6 mg dL – 1 2 mg dL elevated aspartate aminotransferase 73 IU L normal range 13 IU L – 39 IU L elevated c reactive protein 8 6 mg dL normal range 0 – 1 mg dL elevated procalcitonin 2 39 ng mL normal 0 1 ng mL elevated interleukin 6 197 pg mL normal ≤ 5 pg mL elevated cardiac troponin I 142 ng L normal 15 ng L and mildly elevated d dimer 570 ng mL normal 500 ng mL She reported a history of contact with a COVID positive co worker and no recent travel Influenza A B RT PCR were negative She developed acute respiratory distress and was emergently intubated Prone portable PA chest X ray on second day of admission demonstrates persistent airspace opacities cardiomegaly and haziness of the cardiac borders Fig 3 ,"[[60, 66, 'DX'], [99, 111, 'DX'], [198, 203, 'DX'], [219, 224, 'DX'], [230, 249, 'DX'], [290, 301, 'DX'], [846, 851, 'DX'], [943, 969, 'DX'], [1070, 1099, 'EXIST_WORSEN'], [1101, 1113, 'EXIST_WORSEN'], [1118, 1149, 'EXIST_WORSEN']]"
A 29 year old immunocompromised female patient with a 3 day history of cough and fever Past medical history includes severe ulcerative colitis treated with Tofacitinib The patient was admitted to the hospital ward and discharged one week after admission with complete recovery Chest X ray Increase of parenchymal opacity in right lower lobe ,"[[71, 76, 'DX'], [81, 86, 'DX'], [261, 278, 'ABST_RECOVER'], [305, 344, 'EXIST_WORSEN']]"
79 year old woman who presented with chest pain cough and fever for 3 days Coronavirus disease COVID 19 had recently been diagnosed in two of her household members Patient developed acute respiratory distress syndrome within subsequent few days and died 11 days after admission Courtesy of Song F Shanghai Public Health Clinical Center Shanghai China show ground glass opacification GGO on day 1 ,"[[37, 47, 'DX'], [49, 54, 'DX'], [60, 65, 'DX'], [78, 89, 'DX'], [99, 107, 'DX'], [188, 214, 'DX'], [370, 396, 'EXIST_WORSEN'], [398, 401, 'EXIST_WORSEN']]"
79 year old woman who presented with chest pain cough and fever for 3 days Coronavirus disease COVID 19 had recently been diagnosed in two of her household members Patient developed acute respiratory distress syndrome within subsequent few days and died 11 days after admission Courtesy of Song F Shanghai Public Health Clinical Center Shanghai China obtained on day 4 show GGO has progressed to airspace consolidation ,"[[37, 47, 'DX'], [49, 54, 'DX'], [60, 65, 'DX'], [78, 89, 'DX'], [99, 107, 'DX'], [188, 214, 'DX'], [388, 391, 'EXIST_WORSEN'], [410, 432, 'EXIST_WORSEN']]"
History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with Corona X findings day 1 normal findings day 4 bilateral consolidations intubated day 8 bilateral consolidation day 13 extubation PCR positive Follow up Extubated after 9 days of mechanical ventilation ,"[[98, 103, 'DX'], [108, 116, 'DX'], [195, 219, 'EXIST_WORSEN'], [238, 261, 'EXIST_WORSEN']]"
50 year old man was sent to the fever clinic for fever chills cough fatigue and shortness of breath He reported the travel history of Wuhan from January 8 to 12 and the first symptoms appeared on January 14 the first day of onset manifested as mild chills and dry cough But the patient continued to work until going to the hospital on January 21 Figure 1 The patient underwent a chest radiograph and a pharyngeal swab in the hospital The chest radiograph showed multiple patchy images of both lungs Appendix p2 On January 22 the 9th day of onset He was immediately transferred to the isolation ward and oxygen was given through the mask for oxygen support Interferon alpha 2b aerosol inhalation of 5 million U bid and lopinavir ritonavir 500 mg bid Po were used as antiviral treatment and moxifloxacin 0 4 g qd ivgtt to prevent secondary infections Given its severe shortness of breath and hypoxemia methylprednisolone 80 mg bid Ivgtt was given to reduce lung inflammation The laboratory test results are listed in the appendix p4 After receiving medication the patient's body temperature dropped from 39 0 ° C to 36 4 ° C ,"[[32, 37, 'DX'], [49, 54, 'DX'], [56, 62, 'DX'], [64, 69, 'DX'], [84, 103, 'DX'], [258, 264, 'DX'], [269, 278, 'DX'], [476, 512, 'EXIST_WORSEN'], [899, 925, 'DX'], [930, 939, 'DX']]"
chest film normal on admission to hospital,"[[0, 20, 'ABST_RECOVER']]"
patient on mechanical ventilation with bilateral consolidations on the chest film,"[[34, 76, 'EXIST_WORSEN']]"
Chest film of a 83 year old male with mitral insufficiency pulmonary hypertension and atrial fibrillation with COVID 19 infection Ground glass opacification and consolidation in right upper lobe and left lower lobe arrows ,"[[60, 82, 'DX'], [112, 120, 'DX'], [132, 216, 'EXIST_WORSEN']]"
Within a few hours after presentation on the ER the patient became hypoxic and was treated with mechanical ventilation Later that day the patient was transferred to another hospital History 64 year old male with fever and coughing for 2 weeks after a skiing holiday with his family CT findings Widespread GGO in all lobes Crazy paving blue arrows Vascular enlargement black arrow Subpleural bands with retraction yellow arrows Consolidation and bronchiectasis posteriorly in the lower lobes CORADS 5 very high suspicion of COVID 19 PCR positive,"[[67, 74, 'DX'], [215, 220, 'DX'], [225, 233, 'DX'], [299, 326, 'EXIST_WORSEN'], [356, 376, 'EXIST_WORSEN'], [392, 424, 'EXIST_WORSEN'], [460, 505, 'EXIST_WORSEN'], [541, 549, 'DX'], [551, 563, 'DX']]"
83 year old male with mitral insufficiency and pulmonary hypertension was diagnosed with COVID 19 infection The chest film shows consolidation in the right upper lobe green arrow and probably some consolidation in the left lower lobe The patient decided not to be treat with mechanical ventilation and died four days later ,"[[47, 69, 'DX'], [89, 97, 'DX'], [130, 167, 'EXIST_WORSEN'], [200, 236, 'EXIST_WORSEN']]"
Day 1 normal findings History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[55, 74, 'DX'], [123, 128, 'DX'], [133, 141, 'DX'], [170, 178, 'DX'], [180, 192, 'DX']]"
Day 4 bilateral consolidations intubated History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[7, 31, 'EXIST_WORSEN'], [74, 93, 'DX'], [142, 147, 'DX'], [152, 160, 'DX'], [189, 197, 'DX'], [199, 211, 'DX']]"
Day 8 bilateral consolidation History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[7, 30, 'EXIST_WORSEN'], [63, 82, 'DX'], [131, 136, 'DX'], [141, 149, 'DX'], [178, 186, 'DX'], [188, 200, 'DX']]"
Day 13 extubation History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[51, 70, 'DX'], [119, 124, 'DX'], [129, 137, 'DX'], [166, 174, 'DX'], [176, 188, 'DX']]"
72 year old female came to the hospital with sore throat cough dyspnea anosmia and fever for 5 days Physical exam revealed no pathological findings Biochemistry showed lymphopenia decreased prothrombin activity c reactive protein increase and hypoxemia RT PCR was positive for COVID 19 No co morbidities or risk factors were communicated AP chest X Ray a reticular nodular pattern in both lungs mostly in the right one was observed In addition mild opacities in the superior middle and lower right lobes were depicted ,"[[45, 56, 'DX'], [58, 63, 'DX'], [65, 72, 'DX'], [74, 81, 'DX'], [86, 91, 'DX'], [173, 184, 'DX'], [250, 259, 'DX'], [285, 293, 'DX'], [366, 405, 'EXIST_WORSEN'], [459, 518, 'EXIST_WORSEN']]"
A 72 year old female patient with a history of ischaemic stroke ocular myasthenia arterial hyper tension and hypercholesterolaemia was admitted to the emergency department because of dyspnoea She reported having fever and cough for a week At admission her pulse oximeter saturation was 84 the tympanic temperature was 37 6 °C Laboratory findings revealed elevated C reactive protein 19 69 mg dL normal range 0 01 0 5 mg dL and mild lymphopenia 0 7X10 3 mm 3 normal range 1 0 4 0 X10 3 mm 3 The patient also underwent non contrast chest CT AP chest X ray obtained on the second day of admission demonstrated diffuse bilateral opacities tracheal cannula na sogastric tube internal jugular CVC,"[[186, 194, 'DX'], [216, 221, 'DX'], [226, 231, 'DX'], [444, 455, 'DX'], [624, 651, 'EXIST_WORSEN']]"
A 74 year old woman with history of hypertension and heart disease who had been discharged 10 days before knee prosthetic surgery was admitted with 4 day history of fever dry cough and dyspnoea She had not left home since discharge and no family member was affected Analysis revealed lymphopenia elevation of C reactive protein and a positive RT PCR The patient was admitted to the intensive care unit with a favourable course Chest X ray at admission showed diffuse reticular pattern with small opacities in both basal regions,"[[36, 48, 'DX'], [53, 66, 'DX'], [167, 172, 'DX'], [174, 183, 'DX'], [188, 196, 'DX'], [289, 300, 'DX'], [468, 536, 'EXIST_WORSEN']]"
A 74 year old woman with history of hypertension and heart disease who had been discharged 10 days before knee prosthetic surgery was admitted with 4 day history of fever dry cough and dyspnoea She had not left home since discharge and no family member was affected Analysis revealed lymphopenia elevation of C reactive protein and a positive RT PCR The patient was admitted to the intensive care unit with a favourable course Chest X ray on the second day showed diffuse reticular pattern and increased density in both lungs,"[[36, 48, 'DX'], [53, 66, 'DX'], [167, 172, 'DX'], [174, 183, 'DX'], [188, 196, 'DX'], [289, 300, 'DX'], [473, 534, 'EXIST_WORSEN']]"
A 74 year old woman with history of hypertension and heart disease who had been discharged 10 days before knee prosthetic surgery was admitted with 4 day history of fever dry cough and dyspnoea She had not left home since discharge and no family member was affected Analysis revealed lymphopenia elevation of C reactive protein and a positive RT PCR The patient was admitted to the intensive care unit with a favourable course Chest x ray on the eighth day showed improvement with decreased of high density and reticular pattern more evident in the upper left lobe ,"[[36, 48, 'DX'], [53, 66, 'DX'], [167, 172, 'DX'], [174, 183, 'DX'], [188, 196, 'DX'], [289, 300, 'DX'], [473, 574, 'ABST_RECOVER']]"
A sixty five year old woman presented to the emergency department with a 5 day history of nausea and diarrhoea and a 2 day onset of non productive cough and asthenia without fever Her husband had similar symptoms and both had no epidemiological context for COVID 19 infection She had type 2 diabetes mellitus arterial hypertension and chronic renal disease Both were positive on RT PCR test for COVID 19 Anteroposterior chest x ray of a patient infected with COVID 19 that shows consolidations,"[[90, 96, 'DX'], [101, 110, 'DX'], [132, 152, 'DX'], [157, 165, 'DX'], [175, 180, 'DX'], [260, 268, 'DX'], [401, 409, 'DX'], [466, 474, 'DX'], [480, 500, 'EXIST_WORSEN']]"
showing interstitial alveolar hypodiaphania of the middle basal field on the left and basal seat on the right which is associated with pleural veiling on the left ,"[[30, 81, 'EXIST_WORSEN'], [136, 163, 'EXIST_WORSEN']]"
Softened confluent densities with peripheral distribution with associated interstitial weft thickening No pleural effusion Thickening with frosted glass with peripheral distribution and associated thickening of the interlobular septa absence of pleural effusion and in the absence of significant ilo mediastinal lymphadenopathies characterize the TC pattern highly suggestive of CoViD 19 then found later with pharyngeal swab ,"[[9, 57, 'EXIST_WORSEN'], [74, 102, 'EXIST_WORSEN'], [104, 123, 'ABST_RECOVER'], [199, 235, 'EXIST_WORSEN'], [237, 264, 'ABST_RECOVER'], [276, 314, 'ABST_RECOVER'], [315, 332, 'EXIST_WORSEN'], [383, 391, 'DX']]"
posterior bilateral interstitial engagement at the base of the alveolar consolidation area with air bronchograms and moderate concomitant pleural effusion The X ray examination shows nuanced parenchymal thickening in the middle and lower field in the right hemithorax and in the middle field on the left ,"[[64, 86, 'EXIST_WORSEN'], [97, 113, 'EXIST_WORSEN'], [118, 155, 'EXIST_WORSEN'], [185, 215, 'EXIST_WORSEN']]"
Fever cough and shortness of breath on arrival patient saturation of oxygen was 75 There is peripheral patchy air space opacification seen in both lung lower zones with diffuse ground glass haze bilaterally This is the initial plain film raising suspicion of COVID 19 pneumonia RT PCR was sent which turned out to be positive The patient was referred to a COVID 19 dedicated center for further treatment ,"[[0, 5, 'DX'], [7, 12, 'DX'], [17, 36, 'DX'], [95, 160, 'EXIST_WORSEN'], [264, 272, 'DX'], [273, 282, 'DX'], [363, 371, 'DX']]"
Fever dry cough and dyspnea for few days Multiple peripheral opacifications throughout both lungs ,"[[0, 5, 'DX'], [7, 16, 'DX'], [21, 28, 'DX'], [44, 101, 'EXIST_WORSEN']]"
Moderate amount of mid zone airspace opacification in both mid zones with a peripheral predominance ,"[[19, 68, 'EXIST_WORSEN']]"
just stepped down from HDU New oxygen requirements Extensive bilateral airspace opacification in both lungs more pronounced on the right and with relative sparing of the left upper lobe The airspace opacification has a peripheral distribution No pleural effusions ,"[[63, 109, 'EXIST_WORSEN'], [195, 247, 'EXIST_WORSEN'], [250, 270, 'ABST_RECOVER']]"
ITU admission Endotracheal tube nasogastric tube and right internal jugular lines suitable sited Bilateral airspace opacification persists but it has partially regressed since the prior radiograph ,"[[101, 174, 'ABST_RECOVER']]"
Lines and tubes suitably sited Minor regression in the appearances of the lungs from the radiograph of 2 days earlier ,"[[33, 81, 'ABST_RECOVER']]"
increasing oxygen requirements Extubated Positive pressure ventilation mask in use Widespread bilateral airspace opacification in both lungs No longer is the distribution peripheral or sparing the apices No pleural effusions or lobar consolidation ,"[[88, 145, 'EXIST_WORSEN'], [213, 256, 'ABST_RECOVER']]"
Extubated since the prior radiograph Partial regression of the diffuse lungs changes however air bronchograms are now evident in both upper lobes ,"[[39, 78, 'ABST_RECOVER'], [96, 148, 'EXIST_WORSEN']]"
Remarkable improvement in appearances since the radiograph 4 days earlier The current appearances of the lungs are nearly normal and better than the day 1 admission appearances ,"[[0, 37, 'ABST_RECOVER'], [105, 132, 'ABST_RECOVER']]"
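Each row stores its annotations in the entities column as a stringified Python list of [start, end, label] character offsets into the text column, which is why app.py and process_files read the file with an ast.literal_eval converter. A minimal decoding sketch (note that process_text_compare treats the end offset as inclusive, hence the end + 1):

import ast
import pandas as pd

df = pd.read_csv("eval_35.csv", converters={"entities": ast.literal_eval})
row = df.to_dict("records")[0]
for start, end, label in row["entities"]:
    # print each annotated span next to its entity label
    print(label, row["text"][start:end + 1])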
requirements.txt
ADDED
@@ -0,0 +1,32 @@
typing-extensions<4.6.0
altair==4.0
streamlit==1.18.1
keras==2.11.0
matplotlib==3.7.0
nltk==3.8.1
numpy==1.24.2
pandas==1.5.3
plac==1.3.5
PyPDF2==3.0.1
scikit-learn==1.2.1
spacy==3.4.1
#spacy==3.5.0
spacy-transformers==1.1.2
#spacy-transformers==1.2.2
spacy-alignments==0.9.0
spacy-legacy==3.0.12
spacy-loggers==1.0.4
spacy-lookups-data==1.0.3
st-annotated-text
tensorflow==2.11.0
tensorflow-estimator==2.11.0
thinc==8.1.7
tokenizers
torch==1.11.0
tqdm==4.64.1
transformers
negspacy==1.0.3
#en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.1/en_core_web_trf-3.4.1-py3-none-any.whl
https://huggingface.co/Kaelan/en_Radiology_ner_bc5cdr_md/resolve/main/en_Radiology_ner_bc5cdr_md-any-py3-none-any.whl
#https://huggingface.co/Kaelan/en_Radiology_ClinicalBert_Ner/resolve/main/en_Radiology_ClinicalBert_Ner-any-py3-none-any.whl
https://huggingface.co/Kaelan/en_ner_bc5cdr_md/resolve/main/en_ner_bc5cdr_md-any-py3-none-any.whl
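The two uncommented huggingface.co wheels install en_Radiology_ner_bc5cdr_md and en_ner_bc5cdr_md as Python packages, which is what allows st_config.yaml to list them under default_models and app.py to load them by name with spacy.load.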
src/__pycache__/app_utils.cpython-39.pyc
ADDED
Binary file (5.02 kB).
src/__pycache__/inference.cpython-39.pyc
ADDED
Binary file (1.16 kB).
src/__pycache__/model_utils.cpython-39.pyc
ADDED
Binary file (2.79 kB).
src/__pycache__/negation.cpython-39.pyc
ADDED
Binary file (2.51 kB).
src/__pycache__/trainers.cpython-39.pyc
ADDED
Binary file (4.9 kB).
src/app_utils.py
ADDED
@@ -0,0 +1,175 @@
import spacy
import streamlit as st
import pandas as pd

from PyPDF2 import PdfReader
from io import StringIO
import json
import warnings
import os
import ast

@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
#@st.cache_resource
def load_models(model_names: list, args: dict, model_names_dir: list) -> dict:
    """
    Check whether each model name refers to a fine-tuned model located in
    model_dir or a default model native to spacy, and load it accordingly.

    Parameters:
        model_names: list of model names for inference
        args: dict, configuration parameters
        model_names_dir: list of models found in model_dir, i.e. the fine-tuned models

    Returns:
        model_dict: a dictionary whose keys are the model names and whose values are the loaded models.
    """
    assert (model_names is not None) and (len(model_names) != 0), "No models available"

    model_dict = {}
    for model_name in model_names:
        if model_name in model_names_dir:
            # load a fine-tuned model from the model directory
            try:
                model_path = os.path.join(args['model_dir'], model_name)
                model = spacy.load(model_path)
            except OSError:
                warnings.warn(f"Path to {model_name} not found")
        else:
            # load a default model installed as a spacy package
            try:
                model = spacy.load(model_name)
            except OSError:
                warnings.warn(f'Model: {model_name} not found')
        model_dict.update({model_name: model})
    return model_dict

def process_text(doc, selected_entities: list, colors: dict) -> list:
    """
    Process the tokens of the Doc returned by a spacy model so that
    consecutive tokens are grouped together by their entity type. This
    lets st-annotated-text render the tokens for visualization.

    Example: "Hi John, i am sick with cough and flu"
    Entities: person, disease
    Output: [(Hi)(John, 'person', blue)(i am sick)(cough, 'disease', red)(and)(flu, 'disease', red)]

    Parameters:
        doc: spacy Doc
        selected_entities: list of entities to display
        colors: dict mapping entity label to color

    Returns:
        tokens: list of strings and tuples
    """
    tokens = []
    span = ''
    p_ent = None
    last = len(doc)
    for no, token in enumerate(doc):
        add_span = False
        for ent in selected_entities:
            if token.ent_type_ == ent:
                span += token.text + " "
                p_ent = ent
                add_span = True
                # flush the span if the document ends on an entity token
                if no + 1 == last:
                    tokens.append((span, ent, colors[ent], '#464646'))

        # a non-entity token closes any open entity span
        if (add_span is False) and (len(span) > 1):
            tokens.append((span, p_ent, colors[p_ent], '#464646'))
            span = ''
            p_ent = None
        if add_span is False:
            tokens.append(" " + token.text + " ")

    return tokens

def process_text_compare(infer_input: dict, selected_entities: list, colors: dict) -> list:
    """
    Used when the user wants to compare the text annotations between the
    predictions and the labels. Processes the tokens of an evaluation
    record so that they are grouped together by their entity type,
    letting st-annotated-text render the tokens for visualization.

    Example: "Hi John, i am sick with cough and flu"
    Entities: person, disease
    Output: [(Hi)(John, 'person', blue)(i am sick)(cough, 'disease', red)(and)(flu, 'disease', red)]

    Parameters:
        infer_input: dict with 'text' and 'entities' keys
        selected_entities: list of entities to display
        colors: dict mapping entity label to color

    Returns:
        tokens: list of strings and tuples
    """
    tokens = []

    start_ = 0
    end_ = len(infer_input['text'])

    for start, end, entities in infer_input['entities']:
        if entities in selected_entities:
            # the span of words that matches the annotated entity
            span = infer_input['text'][start:end + 1]
            # the span of words before it that matches no entity
            if start_ != start:
                b4_span = infer_input['text'][start_:start]
                tokens.append(" " + b4_span + " ")

            tokens.append((span, entities, colors[entities], '#464646'))
            start_ = end

    if start_ <= end_:
        span = infer_input['text'][start_:end_ + 1]
        tokens.append(" " + span + " ")
    return tokens


def process_files(uploaded_file, text_input):
    """
    The app allows uploading files of multiple types, at present json,
    csv, pdf and txt. Detect what kind of file has been uploaded and
    process it accordingly. If a file has been uploaded, it replaces
    the existing text_input.

    Parameters:
        uploaded_file: streamlit UploadedFile, a subclass of BytesIO and therefore "file-like"
        text_input: str / dict / list

    Returns:
        text_input: list / dict / str
    """
    if uploaded_file is not None:
        if uploaded_file.name.endswith('csv'):
            # literal_eval converts the stringified entity lists back into Python lists
            text_input = pd.read_csv(uploaded_file, converters={'entities': ast.literal_eval})
            text_input = text_input.to_dict('records')

        elif uploaded_file.name.endswith('json'):
            text_input = json.load(uploaded_file)
        else:
            try:
                # plain txt file
                text_input = ""
                stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
                for line in stringio.readlines():
                    text_input += line + "\n"
            except UnicodeDecodeError:
                # binary content: fall back to PDF extraction
                pages_text = []
                reader = PdfReader(uploaded_file)
                # read all the pages of the pdf
                for page in reader.pages:
                    pages_text.append(page.extract_text())
                text_input = ''.join(pages_text)

    return text_input
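To illustrate the grouping that process_text performs, here is a minimal sketch that sets entities by hand on a blank pipeline (the label and color are illustrative only):

import spacy
from spacy.tokens import Span
from src.app_utils import process_text

nlp = spacy.blank("en")
doc = nlp("patient has fever and cough")
# mark 'fever' and 'cough' as DX entities by hand
doc.ents = [Span(doc, 2, 3, label="DX"), Span(doc, 4, 5, label="DX")]

tokens = process_text(doc, ["DX"], colors={"DX": "#FF5733"})
# tokens -> [' patient ', ' has ', ('fever ', 'DX', '#FF5733', '#464646'),
#            ' and ', ('cough ', 'DX', '#FF5733', '#464646')]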
src/inference.py
ADDED
@@ -0,0 +1,33 @@
import pandas as pd

def inference(model, texts: list, batch_size: int = 8):
    """
    Perform batched inference.

    Parameters:
        model: spacy model
        texts: list of input texts
        batch_size: batch size used for inference

    Returns:
        data: pandas.DataFrame with the inference output, one row per entity
    """
    docs = model.pipe(texts, batch_size=batch_size)

    records = []
    for no, doc in enumerate(docs):
        if len(doc.ents) > 0:
            records.append([{'id': no + 1, 'text': doc.text, 'span': entity.text,
                             'entity': entity.label_, 'start': entity.start, 'end': entity.end}
                            for entity in doc.ents])
        else:
            records.append([{'id': no + 1, 'text': doc.text, 'span': None,
                             'entity': None, 'start': None, 'end': None}])

    # flatten the per-document lists into one table indexed by (text, id)
    data = pd.DataFrame.from_dict(sum(records, [])).set_index(['text', 'id'])

    return data
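A short usage sketch, assuming one of the pinned models from requirements.txt is installed (the input sentences are illustrative only):

import spacy
from src.inference import inference

model = spacy.load("en_ner_bc5cdr_md")
data = inference(model, ["Patient has fever and cough.", "No acute findings."], batch_size=2)
# one row per detected entity (or a single all-None row per text), indexed by (text, id)
print(data)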
src/model_utils.py
ADDED
@@ -0,0 +1,111 @@
import json
from pathlib import Path

import spacy
from spacy.training import Example

def make_training_doc(nlp, data: list):
    """
    Convert data into spacy Example objects that can be used for training.

    Parameters:
        nlp: model
        data: training data as (text, annotations) pairs

    Returns:
        training_data: list of spacy Examples
    """
    training_data = []
    for text, annotations in data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        training_data.append(example)

    return training_data


def load_model(model: str = None):
    """
    Load the model indicated by `model`.

    Parameters:
        model: str, name of the model to load

    Returns:
        nlp: spacy model object
        optimizer: the optimizer to be used in training
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        optimizer = nlp.begin_training()

    return nlp, optimizer


def save_model(model, output_dir: str):
    """
    Save the model to output_dir.

    Parameters:
        model: spacy model
        output_dir: path
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        model.to_disk(output_dir)
        print("Saved model to", output_dir)

    return None


def load_data(args):
    """
    Load training data, evaluation data, as well as the entities dictionary.

    Parameters:
        args: dict, configuration from the config file

    Returns:
        train_dict, entities_dict, eval_dict
    """
    assert args['train_dir'] is not None, 'indicate path for training directory'

    # Load the training data
    with open(args['train_dir']) as f:
        train_dict = json.load(f)
    print('Loaded Training Data')

    try:
        entities_dict = train_dict[args['ent_key']]
        print('Loaded Entities from Training Data')
    except KeyError:
        entities_dict = None
        print('No classes for entities found in data loaded. Proceeding to check ent_dir')

    # Load entities
    if args['ent_dir'] is not None and entities_dict is None:
        with open(args['ent_dir']) as f:
            entities_dict = json.load(f)
        entities_dict = entities_dict[args['ent_key']]
        print('Loaded Entities from ent_dir')
    elif args['ent_dir'] is None and entities_dict is None:
        raise ValueError('No entities found from train_dir & ent_dir')

    # Load eval data
    if args['eval_dir'] is not None:
        with open(args['eval_dir']) as f:
            eval_dict = json.load(f)
        print('Loaded Evaluating Data')
    else:
        return train_dict, entities_dict, None

    return train_dict, entities_dict, eval_dict
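load_data only relies on a few keys of the configuration dict. A minimal sketch of the expected shape (the paths and the ent_key value here are hypothetical):

from src.model_utils import load_data

args = {
    'train_dir': 'data/train.json',   # required: json file containing the training data
    'ent_key': 'classes',             # hypothetical key under which the entity labels are stored
    'ent_dir': 'data/entities.json',  # optional fallback source for the entity labels
    'eval_dir': None,                 # optional evaluation data; None skips it
}
train_dict, entities_dict, eval_dict = load_data(args)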
src/negation.py
ADDED
@@ -0,0 +1,78 @@
import spacy
from negspacy.negation import Negex  # importing Negex registers the 'negex' factory
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

def negation(model, entities: dict):
    """
    Take the current model pipeline and add the negation component,
    configured with the given entity types.

    Parameters:
        model: spacy model
        entities: config dict with the entity types to negate

    Returns:
        model: spacy model with negex added to the pipeline
    """
    if 'parser' in model.pipe_names:
        model.remove_pipe('parser')
    if 'sentencizer' not in model.pipe_names:
        model.add_pipe('sentencizer')
    if 'negex' not in model.pipe_names:
        model.add_pipe('negex', config=entities)

    return model

def infer_negation(neg_model, model, text: str, pred_doc):
    """
    Match the results from the negation model against the results from
    the model, and replace the entity type of the spans or tokens in the
    prediction doc that should be negated with the entity type "NEG".

    Parameters:
        neg_model: spacy negation model
        model: spacy model
        text: text sample
        pred_doc: prediction of the text sample from model

    Returns:
        pred_doc: spacy doc with all entities that should be negated replaced with the "NEG" entity type
    """
    doc = neg_model(text)
    results = {'ent': [], 'start': [], 'end': []}
    for e in doc.ents:
        if e._.negex:
            results['ent'].append(e.text)
            results['start'].append(e.start)
            results['end'].append(e.end)

    patterns = [model.make_doc(t) for t in results['ent']]
    matcher = PhraseMatcher(model.vocab)
    matcher.add('NEG', patterns)

    # match all the tokens or spans of text detected as negated against the prediction doc
    matches = matcher(pred_doc)
    seen_tokens = set()
    new_entities = []
    entities = pred_doc.ents

    # keep only exact matches: not just the same span of words but also the same location
    for match in results['start']:
        for match_id, start, end in matches:
            if match == start:
                new_entities.append(Span(pred_doc, start, end, label=match_id))
                # drop any predicted entity that overlaps the negated span
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
    pred_doc.ents = tuple(entities) + tuple(new_entities)

    return pred_doc
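A short usage sketch of the two functions together, mirroring how app.py calls them. It assumes the en_ner_bc5cdr_md model is installed and that negspacy's default termsets cover the phrasing:

import spacy
from src.negation import negation, infer_negation

model = spacy.load("en_ner_bc5cdr_md")
text = "The patient has a cough but no fever."
pred_doc = model(text)  # predict before negex is added, as app.py does

neg_model = negation(model, {"ent_types": list(model.get_pipe("ner").labels)})
pred_doc = infer_negation(neg_model, model, text, pred_doc)
# negated spans now carry the NEG label
print([(e.text, e.label_) for e in pred_doc.ents])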
src/trainers.py
ADDED
@@ -0,0 +1,168 @@
import spacy
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from src.model_utils import *

import random
from tqdm import tqdm

def train_transformer(config: dict, train_data: list, components: list, iter: int,
                      batch_size=compounding(4.0, 32.0, 1.001), entities: list = None, eval_data: list = None):
    """
    Finetune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters
        train_data: list containing the training data
        components: list of components to be trained
        iter: int, number of iterations to train
        batch_size: batch size (or compounding schedule) used for training
        entities: list of entities to be trained on for NER
        eval_data: list containing the evaluation data

    Returns:
        nlp: spacy transformer pipeline
        all_losses: list of the losses at every iteration
    """
    if config['dir'] is not None:
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)

            task = nlp.get_pipe(component)
            if ('ner' in components) and (entities is not None):
                for label in entities:
                    task.add_label(label)

        nlp.initialize()  # don't forget this step!
        optimizer = nlp.create_optimizer()

    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)

    all_losses = []
    for itn in tqdm(range(1, iter + 1)):
        print("Starting iteration " + str(itn))
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
              format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

        all_losses.append([losses[component] for component in components])

    return nlp, all_losses


def train_spacy(model, train_data: list, components: list, iter: int,
                batch_size=compounding(4.0, 32.0, 1.001), entities: list = None, eval_data: list = None):
    """
    Finetune a spacy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name of the spacy model (None trains a blank 'en' model)
        train_data: list containing the training data
        components: list of components to be trained
        iter: int, number of iterations to train
        batch_size: batch size (or compounding schedule) used for training
        entities: list of entities to be trained on for NER
        eval_data: list containing the evaluation data

    Returns:
        nlp: spacy model
        all_losses: list of the losses at every iteration
    """
    # get model and optimizer: an existing spaCy model or a blank one
    nlp, optimizer = load_model(model)

    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)

    # create the built-in pipeline components and add them to the pipeline;
    # nlp.add_pipe works for built-ins that are registered with spaCy
    for component in components:
        if component not in nlp.pipe_names:
            ner = nlp.add_pipe(component, last=True)
        else:
            ner = nlp.get_pipe(component)

        # add labels if component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                ner.add_label(ent)

    print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')

    # get the names of the other pipes, to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.disable_pipes(*other_pipes):  # only train the selected components
        for itn in tqdm(range(1, iter + 1)):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
                  format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

            all_losses.append([losses[component] for component in components])

    return nlp, all_losses

def eval_spacy(model, data):
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spacy model or a spacy transformer pipeline
        data: evaluation data on which scoring is done

    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accepts spacy-format data: (text, annotations) tuples
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except TypeError:
        # accepts the alternative format: records with 'text' and 'entities'
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except Exception as e:
        print(e)
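train_spacy expects the training data as (text, annotations) pairs with character-offset entities. A toy sketch that resumes from the pinned base model (the texts, offsets and iteration count are illustrative only, and it assumes en_ner_bc5cdr_md is installed):

from src.trainers import train_spacy

train_data = [
    ("fever and cough for 3 days", {"entities": [(0, 5, "DX"), (10, 15, "DX")]}),
    ("no pleural effusion", {"entities": [(3, 19, "ABST_RECOVER")]}),
]
nlp, losses = train_spacy("en_ner_bc5cdr_md", train_data,
                          components=["ner"], iter=1,
                          entities=["DX", "ABST_RECOVER"])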
st_config.yaml
ADDED
@@ -0,0 +1,23 @@
#"/app/models/push"
model_dir: ""
default_models: ['en_Radiology_ner_bc5cdr_md','en_ner_bc5cdr_md']
examples:
  radiology_eval_dataset: "eval_35.csv"
  clinical note A:
    "A 74 year old woman with history of hypertension and heart disease who had \
    been discharged 10 days before knee prosthetic surgery was admitted with 4 day \
    history of fever dry cough and dyspnoea. She had not left home since discharge \
    and no family member was affected. Analysis revealed lymphopenia elevation of C \
    reactive protein and a positive RT PCR. The patient was admitted to the intensive \
    care unit with a favourable course. Chest X ray on the second day showed diffuse \
    reticular pattern and increased density in both lungs"
  Clinical note B:
    "A 29 year old immunocompromised female patient with a 3 day history of cough and fever. \
    Past medical history includes severe ulcerative colitis treated with Tofacitinib. \
    The patient was admitted to the hospital ward and discharged one week after admission \
    with complete recovery. Chest X ray shows increase of parenchymal opacity in right lower lobe"
  negation:
    "I still have a cough however there is an absence of fever. I do not have any headache nor \
    stomachache but I do have SARS and COVID 19"

colors_palette: ["#99A3A4","#bc4ed8","#FF5733","#54e96b","#2ce6f5","#f23fc4","#f3a53a","#7FB3D5","#EC7063","#F4D03F"]
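Each key under examples becomes an option in the app's "Select an example text" sidebar box; radiology_eval_dataset is special-cased in app.py to load eval_35.csv instead of filling the text area. colors_palette supplies one color per NER label, assigned in the order the selected model reports its labels.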