Kaelan
committed on
Commit a197a13
1 Parent(s): cd3ed14
Add application file
Browse files
- app.py +209 -0
- dockerfile +24 -0
- eval_35.csv +34 -0
- requirements.txt +32 -0
- src/__pycache__/app_utils.cpython-39.pyc +0 -0
- src/__pycache__/inference.cpython-39.pyc +0 -0
- src/__pycache__/model_utils.cpython-39.pyc +0 -0
- src/__pycache__/negation.cpython-39.pyc +0 -0
- src/__pycache__/trainers.cpython-39.pyc +0 -0
- src/app_utils.py +175 -0
- src/inference.py +33 -0
- src/model_utils.py +111 -0
- src/negation.py +78 -0
- src/trainers.py +168 -0
- st_config.yaml +23 -0
app.py
ADDED
@@ -0,0 +1,209 @@
import streamlit as st
from annotated_text import annotated_text
import pandas as pd
import yaml
import os
import ast

from src.negation import *
from src.app_utils import *
from src.inference import inference
from src.trainers import eval_spacy

#### Loading configuration and models ####

with open('./st_config.yaml', "r") as yamlfile:
    args = yaml.load(yamlfile, Loader=yaml.FullLoader)

if args['model_dir'] is None:
    model_names_dir = []
elif os.path.exists(args['model_dir']):
    model_names_dir = os.listdir(args['model_dir'])
else:
    model_names_dir = []

model_names = model_names_dir + args['default_models'] if args['default_models'] is not None else model_names_dir

st.title('Radiology NER')
st.markdown('This app experiments with using NER to extract spans of text from radiological notes that indicate the current condition of the patient. The targeted extractions include \n 1) the symptoms of the disease \n 2) the location of the organs affected by the symptoms \n 3) and the progress of the disease. \nen_ner_bc5cdr_md is the base model, already trained to detect diseases and chemicals. en_Radiology_ner_bc5cdr_md is fine-tuned from the base model with additional entities that indicate "Existence or Worsening" and "Absence or Recovering" of symptoms. This helps practitioners quickly spot the key words in a clinical report, and the tabulated result can be used for other downstream analysis.')


##################################
#### Sidebar (Choose Model) ######
##################################
model_name = st.sidebar.selectbox("Select a model", options=model_names)
if len(model_names) > 0:
    models = load_models(model_names, args, model_names_dir)
    selected_model = models[model_name]

##################################
#### Sidebar (Choose Example) ####
##################################
st.sidebar.markdown('###')
if args['examples'] is not None:
    chosen_note = st.sidebar.selectbox("Select an example text", options=args['examples'].keys())
else:
    chosen_note = None

if chosen_note == "radiology_eval_dataset":
    # literal_eval converts the stringified entity lists back into Python lists
    text_input = pd.read_csv("./eval_35.csv", converters={'entities': ast.literal_eval})
    text_input = text_input.to_dict('records')

# set colors for each entity
if len(model_names) > 0:
    ents_available = selected_model.get_pipe('ner').labels
    ent_colors_map = dict(zip(ents_available, args['colors_palette'][:len(ents_available)]))

##################
### Text area ###
##################
if chosen_note != "radiology_eval_dataset":
    text_input = st.text_area("Type notes in the box below",
                              value=args['examples'][chosen_note] if args['examples'] is not None else '')
st.markdown("---")

############################
### Sidebar (Load Files) ###
############################
st.sidebar.info('For csv & json files, name the text column to be inferred "text" and the annotated-labels column "entities". The expected json format is shown below.')
st.sidebar.json([{"text": "example", "entities": [[5, 6, "do"], [8, 11, "dx"]]},
                 {"text": "example2", "entities": [[5, 6, "do"], [8, 11, "dx"]]}], expanded=False)
uploaded_file = st.sidebar.file_uploader("Upload a file", type=["csv", "json", "pdf", "txt"])
text_input = process_files(uploaded_file, text_input)

#################################
### Sidebar (Select Entities) ###
#################################
selected_entities = st.sidebar.multiselect(
    "Select the entities you want to view",
    options=ents_available if len(model_names) > 0 else [],
    default=ents_available if len(model_names) > 0 else [],
)

##########################
### Text Area (Slider) ###
##########################
if len(text_input) > 1 and isinstance(text_input, (list, dict)):
    sample = st.slider('Select Example', min_value=1, max_value=len(text_input))
else:
    sample = None

# Process documents to tokens
if len(model_names) > 0:
    infer_input = text_input[sample - 1]["text"] if sample is not None else text_input
    doc = selected_model(infer_input)

textcol_negate, textcol_compare = st.columns([1, 1])

# checkbox for negation
negate = textcol_negate.checkbox('Check for Negation')

##########################################
### Checkboxes for Compare with labels ###
##########################################
if isinstance(text_input, (dict, list)):
    if 'entities' in text_input[0].keys():
        state_compare = False
        compare = textcol_compare.checkbox('Compare between predictions and labels', disabled=state_compare)
    else:
        state_compare, compare = True, False
else:
    state_compare, compare = True, False

###############################
### Processing for negation ###
###############################
if negate:
    neg_ent = {"ent_types": list(selected_model.get_pipe('ner').labels)}
    neg = negation(selected_model, neg_ent)
    doc = infer_negation(neg, selected_model, infer_input, doc)
    selected_entities += ['NEG']
    ent_colors_map.update({'NEG': '#C7C7C7'})

#################################
### Processing for comparison ###
#################################
if compare and isinstance(text_input, (dict, list)):
    infer_input = text_input[sample - 1]
    tokens_compare = process_text_compare(infer_input, selected_entities, colors=ent_colors_map)

tokens = process_text(doc, selected_entities, colors=ent_colors_map)

st.markdown('##')
# Display results
st.markdown('#### Predictions')
annotated_text(*tokens)

if compare and isinstance(text_input, (dict, list)):
    st.markdown('#### Labels')
    annotated_text(*tokens_compare)

st.markdown("---")
data = pd.DataFrame.from_dict([{'label': entity.label_, 'text': entity.text, 'start': entity.start, 'end': entity.end}
                               for entity in doc.ents])
if data.shape[1] > 0:
    st.table(data['label'].value_counts())
    myexpander = st.expander('Details on text')
    myexpander.table(data)

###################################
#### Inference on whole dataset ###
###################################
infer_whole_dataset = st.checkbox('Inference on whole dataset')
if isinstance(text_input, (dict, list)) and infer_whole_dataset:
    texts = [record['text'] for record in text_input]

    st.markdown('### Prediction on whole dataset')
    inference_data = inference(selected_model, texts)

    ### Applying negation to the whole dataset
    if negate:
        neg_ent = {"ent_types": list(selected_model.get_pipe('ner').labels)}
        neg = negation(selected_model, neg_ent)
        docs = selected_model.pipe(texts, batch_size=8)

        records = []
        for no, doc in enumerate(docs):
            doc = infer_negation(neg, selected_model, texts[no], doc)
            if len(doc.ents) > 0:
                records.append([{'id': no + 1, 'text': doc.text, 'span': entity.text,
                                 'entity': entity.label_, 'start': entity.start, 'end': entity.end}
                                for entity in doc.ents])
            else:
                records.append([{'id': no + 1, 'text': doc.text, 'span': None,
                                 'entity': None, 'start': None, 'end': None}])

        inference_data = pd.DataFrame.from_dict(sum(records, [])).set_index(['text', 'id'])

    st.download_button(
        label="Download Prediction as CSV",
        data=inference_data.to_csv().encode('utf-8'),
        file_name='inference_data.csv',
        mime='text/csv',
    )

    #########################################
    ### Expander for dataframe and report ###
    #########################################
    report_expander = st.expander('Report on Evaluation Results')
    results_metrics = eval_spacy(selected_model, text_input)
    overall_score = pd.DataFrame.from_dict({'Type': ['Overall'],
                                            'Precision': [results_metrics['ents_p']],
                                            'Recall': [results_metrics['ents_r']],
                                            'F1': [results_metrics['ents_f']]})
    overall_score = overall_score.set_index('Type')
    entities_score = pd.DataFrame.from_dict(results_metrics['ents_per_type']).T
    entities_score = entities_score.rename(columns={'p': 'Precision', 'r': 'Recall', 'f': 'F1'})
    report_expander.table(overall_score)
    report_expander.table(entities_score)

    df_expander = st.expander('Inference Table')
    df_expander.write(inference_data.to_html(), unsafe_allow_html=True)
    #df_expander.table(inference_data)
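Outside of Streamlit, the core flow of the app — load a model, run it over a note, and tabulate the detected entities — can be reproduced directly. A minimal sketch, assuming the pinned en_ner_bc5cdr_md wheel from requirements.txt is installed (the example sentence is illustrative only):

import spacy
import pandas as pd

# load one of the models the app lists in st_config.yaml
nlp = spacy.load("en_ner_bc5cdr_md")
doc = nlp("Patient presented with fever and dry cough for 3 days.")

# same tabulation app.py builds for the 'Details on text' expander
data = pd.DataFrame.from_dict([{'label': e.label_, 'text': e.text, 'start': e.start, 'end': e.end}
                               for e in doc.ents])
print(data)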
dockerfile
ADDED
@@ -0,0 +1,24 @@
FROM python:3.9.16-slim-buster

WORKDIR /app

RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    && pip install --upgrade pip \
    && apt clean && rm -rf /var/lib/apt/lists/*

COPY . /app

RUN pip3 install -r requirements.txt

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

RUN mkdir ./models && chmod 777 ./models

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
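Assuming Docker is available, the image builds and serves the app with the standard docker build and docker run -p 8501:8501 commands; the HEALTHCHECK above polls Streamlit's /_stcore/health endpoint on the exposed port, and the world-writable ./models directory appears intended as a writable location for models at runtime.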
eval_35.csv
ADDED
@@ -0,0 +1,34 @@
text,entities
A 30 year old female patient with a past medical history of asthma morbid obesity BMI 39 5 and hypertension on an angiotensin converting enzyme ACE inhibitor presented with a 6 day history of fever Tmax 38 9°C cough and shortness of breath Laboratory studies were remarkable for lymphopenia 0 6×103 µL normal range 0 9×103 µL – 3 3×103 µL elevated serum creatinine 1 3 mg dL normal range 0 6 mg dL – 1 2 mg dL elevated aspartate aminotransferase 73 IU L normal range 13 IU L – 39 IU L elevated c reactive protein 8 6 mg dL normal range 0 – 1 mg dL elevated procalcitonin 2 39 ng mL normal 0 1 ng mL elevated interleukin 6 197 pg mL normal ≤ 5 pg mL elevated cardiac troponin I 142 ng L normal 15 ng L and mildly elevated d dimer 570 ng mL normal 500 ng mL She reported a history of contact with a COVID positive co worker and no recent travel Influenza A B RT PCR were negative She developed acute respiratory distress and was emergently intubated Prone portable PA chest X ray on second day of admission demonstrates persistent airspace opacities cardiomegaly and haziness of the cardiac borders Fig 3 ,"[[60, 66, 'DX'], [99, 111, 'DX'], [198, 203, 'DX'], [219, 224, 'DX'], [230, 249, 'DX'], [290, 301, 'DX'], [846, 851, 'DX'], [943, 969, 'DX'], [1070, 1099, 'EXIST_WORSEN'], [1101, 1113, 'EXIST_WORSEN'], [1118, 1149, 'EXIST_WORSEN']]"
A 29 year old immunocompromised female patient with a 3 day history of cough and fever Past medical history includes severe ulcerative colitis treated with Tofacitinib The patient was admitted to the hospital ward and discharged one week after admission with complete recovery Chest X ray Increase of parenchymal opacity in right lower lobe ,"[[71, 76, 'DX'], [81, 86, 'DX'], [261, 278, 'ABST_RECOVER'], [305, 344, 'EXIST_WORSEN']]"
79 year old woman who presented with chest pain cough and fever for 3 days Coronavirus disease COVID 19 had recently been diagnosed in two of her household members Patient developed acute respiratory distress syndrome within subsequent few days and died 11 days after admission Courtesy of Song F Shanghai Public Health Clinical Center Shanghai China show ground glass opacification GGO on day 1 ,"[[37, 47, 'DX'], [49, 54, 'DX'], [60, 65, 'DX'], [78, 89, 'DX'], [99, 107, 'DX'], [188, 214, 'DX'], [370, 396, 'EXIST_WORSEN'], [398, 401, 'EXIST_WORSEN']]"
79 year old woman who presented with chest pain cough and fever for 3 days Coronavirus disease COVID 19 had recently been diagnosed in two of her household members Patient developed acute respiratory distress syndrome within subsequent few days and died 11 days after admission Courtesy of Song F Shanghai Public Health Clinical Center Shanghai China obtained on day 4 show GGO has progressed to airspace consolidation ,"[[37, 47, 'DX'], [49, 54, 'DX'], [60, 65, 'DX'], [78, 89, 'DX'], [99, 107, 'DX'], [188, 214, 'DX'], [388, 391, 'EXIST_WORSEN'], [410, 432, 'EXIST_WORSEN']]"
History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with Corona X findings day 1 normal findings day 4 bilateral consolidations intubated day 8 bilateral consolidation day 13 extubation PCR positive Follow up Extubated after 9 days of mechanical ventilation ,"[[98, 103, 'DX'], [108, 116, 'DX'], [195, 219, 'EXIST_WORSEN'], [238, 261, 'EXIST_WORSEN']]"
50 year old man was sent to the fever clinic for fever chills cough fatigue and shortness of breath He reported the travel history of Wuhan from January 8 to 12 and the first symptoms appeared on January 14 the first day of onset manifested as mild chills and dry cough But the patient continued to work until going to the hospital on January 21 Figure 1 The patient underwent a chest radiograph and a pharyngeal swab in the hospital The chest radiograph showed multiple patchy images of both lungs Appendix p2 On January 22 the 9th day of onset He was immediately transferred to the isolation ward and oxygen was given through the mask for oxygen support Interferon alpha 2b aerosol inhalation of 5 million U bid and lopinavir ritonavir 500 mg bid Po were used as antiviral treatment and moxifloxacin 0 4 g qd ivgtt to prevent secondary infections Given its severe shortness of breath and hypoxemia methylprednisolone 80 mg bid Ivgtt was given to reduce lung inflammation The laboratory test results are listed in the appendix p4 After receiving medication the patient's body temperature dropped from 39 0 ° C to 36 4 ° C ,"[[32, 37, 'DX'], [49, 54, 'DX'], [56, 62, 'DX'], [64, 69, 'DX'], [84, 103, 'DX'], [258, 264, 'DX'], [269, 278, 'DX'], [476, 512, 'EXIST_WORSEN'], [899, 925, 'DX'], [930, 939, 'DX']]"
chest film normal on admission to hospital,"[[0, 20, 'ABST_RECOVER']]"
patient on mechanical ventilation with bilateral consolidations on the chest film,"[[34, 76, 'EXIST_WORSEN']]"
Chest film of a 83 year old male with mitral insufficiency pulmonary hypertension and atrial fibrillation with COVID 19 infection Ground glass opacification and consolidation in right upper lobe and left lower lobe arrows ,"[[60, 82, 'DX'], [112, 120, 'DX'], [132, 216, 'EXIST_WORSEN']]"
Within a few hours after presentation on the ER the patient became hypoxic and was treated with mechanical ventilation Later that day the patient was transferred to another hospital History 64 year old male with fever and coughing for 2 weeks after a skiing holiday with his family CT findings Widespread GGO in all lobes Crazy paving blue arrows Vascular enlargement black arrow Subpleural bands with retraction yellow arrows Consolidation and bronchiectasis posteriorly in the lower lobes CORADS 5 very high suspicion of COVID 19 PCR positive,"[[67, 74, 'DX'], [215, 220, 'DX'], [225, 233, 'DX'], [299, 326, 'EXIST_WORSEN'], [356, 376, 'EXIST_WORSEN'], [392, 424, 'EXIST_WORSEN'], [460, 505, 'EXIST_WORSEN'], [541, 549, 'DX'], [551, 563, 'DX']]"
83 year old male with mitral insufficiency and pulmonary hypertension was diagnosed with COVID 19 infection The chest film shows consolidation in the right upper lobe green arrow and probably some consolidation in the left lower lobe The patient decided not to be treat with mechanical ventilation and died four days later ,"[[47, 69, 'DX'], [89, 97, 'DX'], [130, 167, 'EXIST_WORSEN'], [200, 236, 'EXIST_WORSEN']]"
Day 1 normal findings History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[55, 74, 'DX'], [123, 128, 'DX'], [133, 141, 'DX'], [170, 178, 'DX'], [180, 192, 'DX']]"
Day 4 bilateral consolidations intubated History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[7, 31, 'EXIST_WORSEN'], [74, 93, 'DX'], [142, 147, 'DX'], [152, 160, 'DX'], [189, 197, 'DX'], [199, 211, 'DX']]"
Day 8 bilateral consolidation History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[7, 30, 'EXIST_WORSEN'], [63, 82, 'DX'], [131, 136, 'DX'], [141, 149, 'DX'], [178, 186, 'DX'], [188, 200, 'DX']]"
Day 13 extubation History 73 year old male with aorta insufficiency and pacemaker was admitted to the hospital with fever and coughing after being in an area with COVID 19 PCR positive Follow up extubated after 9 days of mechanical ventilation ,"[[51, 70, 'DX'], [119, 124, 'DX'], [129, 137, 'DX'], [166, 174, 'DX'], [176, 188, 'DX']]"
72 year old female came to the hospital with sore throat cough dyspnea anosmia and fever for 5 days Physical exam revealed no pathological findings Biochemistry showed lymphopenia decreased prothrombin activity c reactive protein increase and hypoxemia RT PCR was positive for COVID 19 No co morbidities or risk factors were communicated AP chest X Ray a reticular nodular pattern in both lungs mostly in the right one was observed In addition mild opacities in the superior middle and lower right lobes were depicted ,"[[45, 56, 'DX'], [58, 63, 'DX'], [65, 72, 'DX'], [74, 81, 'DX'], [86, 91, 'DX'], [173, 184, 'DX'], [250, 259, 'DX'], [285, 293, 'DX'], [366, 405, 'EXIST_WORSEN'], [459, 518, 'EXIST_WORSEN']]"
A 72 year old female patient with a history of ischaemic stroke ocular myasthenia arterial hyper tension and hypercholesterolaemia was admitted to the emergency department because of dyspnoea She reported having fever and cough for a week At admission her pulse oximeter saturation was 84 the tympanic temperature was 37 6 °C Laboratory findings revealed elevated C reactive protein 19 69 mg dL normal range 0 01 0 5 mg dL and mild lymphopenia 0 7X10 3 mm 3 normal range 1 0 4 0 X10 3 mm 3 The patient also underwent non contrast chest CT AP chest X ray obtained on the second day of admission demonstrated diffuse bilateral opacities tracheal cannula na sogastric tube internal jugular CVC,"[[186, 194, 'DX'], [216, 221, 'DX'], [226, 231, 'DX'], [444, 455, 'DX'], [624, 651, 'EXIST_WORSEN']]"
A 74 year old woman with history of hypertension and heart disease who had been discharged 10 days before knee prosthetic surgery was admitted with 4 day history of fever dry cough and dyspnoea She had not left home since discharge and no family member was affected Analysis revealed lymphopenia elevation of C reactive protein and a positive RT PCR The patient was admitted to the intensive care unit with a favourable course Chest X ray at admission showed diffuse reticular pattern with small opacities in both basal regions,"[[36, 48, 'DX'], [53, 66, 'DX'], [167, 172, 'DX'], [174, 183, 'DX'], [188, 196, 'DX'], [289, 300, 'DX'], [468, 536, 'EXIST_WORSEN']]"
A 74 year old woman with history of hypertension and heart disease who had been discharged 10 days before knee prosthetic surgery was admitted with 4 day history of fever dry cough and dyspnoea She had not left home since discharge and no family member was affected Analysis revealed lymphopenia elevation of C reactive protein and a positive RT PCR The patient was admitted to the intensive care unit with a favourable course Chest X ray on the second day showed diffuse reticular pattern and increased density in both lungs,"[[36, 48, 'DX'], [53, 66, 'DX'], [167, 172, 'DX'], [174, 183, 'DX'], [188, 196, 'DX'], [289, 300, 'DX'], [473, 534, 'EXIST_WORSEN']]"
A 74 year old woman with history of hypertension and heart disease who had been discharged 10 days before knee prosthetic surgery was admitted with 4 day history of fever dry cough and dyspnoea She had not left home since discharge and no family member was affected Analysis revealed lymphopenia elevation of C reactive protein and a positive RT PCR The patient was admitted to the intensive care unit with a favourable course Chest x ray on the eighth day showed improvement with decreased of high density and reticular pattern more evident in the upper left lobe ,"[[36, 48, 'DX'], [53, 66, 'DX'], [167, 172, 'DX'], [174, 183, 'DX'], [188, 196, 'DX'], [289, 300, 'DX'], [473, 574, 'ABST_RECOVER']]"
A sixty five year old woman presented to the emergency department with a 5 day history of nausea and diarrhoea and a 2 day onset of non productive cough and asthenia without fever Her husband had similar symptoms and both had no epidemiological context for COVID 19 infection She had type 2 diabetes mellitus arterial hypertension and chronic renal disease Both were positive on RT PCR test for COVID 19 Anteroposterior chest x ray of a patient infected with COVID 19 that shows consolidations,"[[90, 96, 'DX'], [101, 110, 'DX'], [132, 152, 'DX'], [157, 165, 'DX'], [175, 180, 'DX'], [260, 268, 'DX'], [401, 409, 'DX'], [466, 474, 'DX'], [480, 500, 'EXIST_WORSEN']]"
showing interstitial alveolar hypodiaphania of the middle basal field on the left and basal seat on the right which is associated with pleural veiling on the left ,"[[30, 81, 'EXIST_WORSEN'], [136, 163, 'EXIST_WORSEN']]"
Softened confluent densities with peripheral distribution with associated interstitial weft thickening No pleural effusion Thickening with frosted glass with peripheral distribution and associated thickening of the interlobular septa absence of pleural effusion and in the absence of significant ilo mediastinal lymphadenopathies characterize the TC pattern highly suggestive of CoViD 19 then found later with pharyngeal swab ,"[[9, 57, 'EXIST_WORSEN'], [74, 102, 'EXIST_WORSEN'], [104, 123, 'ABST_RECOVER'], [199, 235, 'EXIST_WORSEN'], [237, 264, 'ABST_RECOVER'], [276, 314, 'ABST_RECOVER'], [315, 332, 'EXIST_WORSEN'], [383, 391, 'DX']]"
posterior bilateral interstitial engagement at the base of the alveolar consolidation area with air bronchograms and moderate concomitant pleural effusion The X ray examination shows nuanced parenchymal thickening in the middle and lower field in the right hemithorax and in the middle field on the left ,"[[64, 86, 'EXIST_WORSEN'], [97, 113, 'EXIST_WORSEN'], [118, 155, 'EXIST_WORSEN'], [185, 215, 'EXIST_WORSEN']]"
Fever cough and shortness of breath on arrival patient saturation of oxygen was 75 There is peripheral patchy air space opacification seen in both lung lower zones with diffuse ground glass haze bilaterally This is the initial plain film raising suspicion of COVID 19 pneumonia RT PCR was sent which turned out to be positive The patient was referred to a COVID 19 dedicated center for further treatment ,"[[0, 5, 'DX'], [7, 12, 'DX'], [17, 36, 'DX'], [95, 160, 'EXIST_WORSEN'], [264, 272, 'DX'], [273, 282, 'DX'], [363, 371, 'DX']]"
Fever dry cough and dyspnea for few days Multiple peripheral opacifications throughout both lungs ,"[[0, 5, 'DX'], [7, 16, 'DX'], [21, 28, 'DX'], [44, 101, 'EXIST_WORSEN']]"
Moderate amount of mid zone airspace opacification in both mid zones with a peripheral predominance ,"[[19, 68, 'EXIST_WORSEN']]"
just stepped down from HDU New oxygen requirements Extensive bilateral airspace opacification in both lungs more pronounced on the right and with relative sparing of the left upper lobe The airspace opacification has a peripheral distribution No pleural effusions ,"[[63, 109, 'EXIST_WORSEN'], [195, 247, 'EXIST_WORSEN'], [250, 270, 'ABST_RECOVER']]"
ITU admission Endotracheal tube nasogastric tube and right internal jugular lines suitable sited Bilateral airspace opacification persists but it has partially regressed since the prior radiograph ,"[[101, 174, 'ABST_RECOVER']]"
Lines and tubes suitably sited Minor regression in the appearances of the lungs from the radiograph of 2 days earlier ,"[[33, 81, 'ABST_RECOVER']]"
increasing oxygen requirements Extubated Positive pressure ventilation mask in use Widespread bilateral airspace opacification in both lungs No longer is the distribution peripheral or sparing the apices No pleural effusions or lobar consolidation ,"[[88, 145, 'EXIST_WORSEN'], [213, 256, 'ABST_RECOVER']]"
Extubated since the prior radiograph Partial regression of the diffuse lungs changes however air bronchograms are now evident in both upper lobes ,"[[39, 78, 'ABST_RECOVER'], [96, 148, 'EXIST_WORSEN']]"
Remarkable improvement in appearances since the radiograph 4 days earlier The current appearances of the lungs are nearly normal and better than the day 1 admission appearances ,"[[0, 37, 'ABST_RECOVER'], [105, 132, 'ABST_RECOVER']]"
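Each row stores its annotations in the entities column as a stringified Python list of [start, end, label] character offsets into the text column, which is why app.py and process_files read the file with an ast.literal_eval converter. A minimal decoding sketch (note that process_text_compare treats the end offset as inclusive, hence the end + 1):

import ast
import pandas as pd

df = pd.read_csv("eval_35.csv", converters={"entities": ast.literal_eval})
row = df.to_dict("records")[0]
for start, end, label in row["entities"]:
    # print each annotated span next to its entity label
    print(label, row["text"][start:end + 1])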
requirements.txt
ADDED
@@ -0,0 +1,32 @@
typing-extensions<4.6.0
altair==4.0
streamlit==1.18.1
keras==2.11.0
matplotlib==3.7.0
nltk==3.8.1
numpy==1.24.2
pandas==1.5.3
plac==1.3.5
PyPDF2==3.0.1
scikit-learn==1.2.1
spacy==3.4.1
#spacy==3.5.0
spacy-transformers==1.1.2
#spacy-transformers==1.2.2
spacy-alignments==0.9.0
spacy-legacy==3.0.12
spacy-loggers==1.0.4
spacy-lookups-data==1.0.3
st-annotated-text
tensorflow==2.11.0
tensorflow-estimator==2.11.0
thinc==8.1.7
tokenizers
torch==1.11.0
tqdm==4.64.1
transformers
negspacy==1.0.3
#en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.1/en_core_web_trf-3.4.1-py3-none-any.whl
https://huggingface.co/Kaelan/en_Radiology_ner_bc5cdr_md/resolve/main/en_Radiology_ner_bc5cdr_md-any-py3-none-any.whl
#https://huggingface.co/Kaelan/en_Radiology_ClinicalBert_Ner/resolve/main/en_Radiology_ClinicalBert_Ner-any-py3-none-any.whl
https://huggingface.co/Kaelan/en_ner_bc5cdr_md/resolve/main/en_ner_bc5cdr_md-any-py3-none-any.whl
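The two uncommented huggingface.co wheels install en_Radiology_ner_bc5cdr_md and en_ner_bc5cdr_md as Python packages, which is what allows st_config.yaml to list them under default_models and app.py to load them by name with spacy.load.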
src/__pycache__/app_utils.cpython-39.pyc
ADDED
Binary file (5.02 kB).
src/__pycache__/inference.cpython-39.pyc
ADDED
Binary file (1.16 kB).
src/__pycache__/model_utils.cpython-39.pyc
ADDED
Binary file (2.79 kB).
src/__pycache__/negation.cpython-39.pyc
ADDED
Binary file (2.51 kB).
src/__pycache__/trainers.cpython-39.pyc
ADDED
Binary file (4.9 kB).
src/app_utils.py
ADDED
@@ -0,0 +1,175 @@
import spacy
import streamlit as st
import pandas as pd

from PyPDF2 import PdfReader
from io import StringIO
import json
import warnings
import os
import ast

@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
#@st.cache_resource
def load_models(model_names: list, args: dict, model_names_dir: list) -> dict:
    """
    Check whether each model name refers to a fine-tuned model located in
    model_dir or a default model native to spacy, and load it accordingly.

    Parameters:
        model_names: list of model names for inference
        args: dict, configuration parameters
        model_names_dir: list of models found in model_dir, i.e. the fine-tuned models

    Returns:
        model_dict: a dictionary whose keys are the model names and whose values are the loaded models.
    """
    assert (model_names is not None) and (len(model_names) != 0), "No models available"

    model_dict = {}
    for model_name in model_names:
        if model_name in model_names_dir:
            # load a fine-tuned model from the model directory
            try:
                model_path = os.path.join(args['model_dir'], model_name)
                model = spacy.load(model_path)
            except OSError:
                warnings.warn(f"Path to {model_name} not found")
        else:
            # load a default model installed as a spacy package
            try:
                model = spacy.load(model_name)
            except OSError:
                warnings.warn(f'Model: {model_name} not found')
        model_dict.update({model_name: model})
    return model_dict

def process_text(doc, selected_entities: list, colors: dict) -> list:
    """
    Process the tokens of the Doc returned by a spacy model so that
    consecutive tokens are grouped together by their entity type. This
    lets st-annotated-text render the tokens for visualization.

    Example: "Hi John, i am sick with cough and flu"
    Entities: person, disease
    Output: [(Hi)(John, 'person', blue)(i am sick)(cough, 'disease', red)(and)(flu, 'disease', red)]

    Parameters:
        doc: spacy Doc
        selected_entities: list of entities to display
        colors: dict mapping entity label to color

    Returns:
        tokens: list of strings and tuples
    """
    tokens = []
    span = ''
    p_ent = None
    last = len(doc)
    for no, token in enumerate(doc):
        add_span = False
        for ent in selected_entities:
            if token.ent_type_ == ent:
                span += token.text + " "
                p_ent = ent
                add_span = True
                # flush the span if the document ends on an entity token
                if no + 1 == last:
                    tokens.append((span, ent, colors[ent], '#464646'))

        # a non-entity token closes any open entity span
        if (add_span is False) and (len(span) > 1):
            tokens.append((span, p_ent, colors[p_ent], '#464646'))
            span = ''
            p_ent = None
        if add_span is False:
            tokens.append(" " + token.text + " ")

    return tokens

def process_text_compare(infer_input: dict, selected_entities: list, colors: dict) -> list:
    """
    Used when the user wants to compare the text annotations between the
    predictions and the labels. Processes the tokens of an evaluation
    record so that they are grouped together by their entity type,
    letting st-annotated-text render the tokens for visualization.

    Example: "Hi John, i am sick with cough and flu"
    Entities: person, disease
    Output: [(Hi)(John, 'person', blue)(i am sick)(cough, 'disease', red)(and)(flu, 'disease', red)]

    Parameters:
        infer_input: dict with 'text' and 'entities' keys
        selected_entities: list of entities to display
        colors: dict mapping entity label to color

    Returns:
        tokens: list of strings and tuples
    """
    tokens = []

    start_ = 0
    end_ = len(infer_input['text'])

    for start, end, entities in infer_input['entities']:
        if entities in selected_entities:
            # the span of words that matches the annotated entity
            span = infer_input['text'][start:end + 1]
            # the span of words before it that matches no entity
            if start_ != start:
                b4_span = infer_input['text'][start_:start]
                tokens.append(" " + b4_span + " ")

            tokens.append((span, entities, colors[entities], '#464646'))
            start_ = end

    if start_ <= end_:
        span = infer_input['text'][start_:end_ + 1]
        tokens.append(" " + span + " ")
    return tokens


def process_files(uploaded_file, text_input):
    """
    The app allows uploading files of multiple types, at present json,
    csv, pdf and txt. Detect what kind of file has been uploaded and
    process it accordingly. If a file has been uploaded, it replaces
    the existing text_input.

    Parameters:
        uploaded_file: streamlit UploadedFile, a subclass of BytesIO and therefore "file-like"
        text_input: str / dict / list

    Returns:
        text_input: list / dict / str
    """
    if uploaded_file is not None:
        if uploaded_file.name.endswith('csv'):
            # literal_eval converts the stringified entity lists back into Python lists
            text_input = pd.read_csv(uploaded_file, converters={'entities': ast.literal_eval})
            text_input = text_input.to_dict('records')

        elif uploaded_file.name.endswith('json'):
            text_input = json.load(uploaded_file)
        else:
            try:
                # plain txt file
                text_input = ""
                stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
                for line in stringio.readlines():
                    text_input += line + "\n"
            except UnicodeDecodeError:
                # binary content: fall back to PDF extraction
                pages_text = []
                reader = PdfReader(uploaded_file)
                # read all the pages of the pdf
                for page in reader.pages:
                    pages_text.append(page.extract_text())
                text_input = ''.join(pages_text)

    return text_input
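To illustrate the grouping that process_text performs, here is a minimal sketch that sets entities by hand on a blank pipeline (the label and color are illustrative only):

import spacy
from spacy.tokens import Span
from src.app_utils import process_text

nlp = spacy.blank("en")
doc = nlp("patient has fever and cough")
# mark 'fever' and 'cough' as DX entities by hand
doc.ents = [Span(doc, 2, 3, label="DX"), Span(doc, 4, 5, label="DX")]

tokens = process_text(doc, ["DX"], colors={"DX": "#FF5733"})
# tokens -> [' patient ', ' has ', ('fever ', 'DX', '#FF5733', '#464646'),
#            ' and ', ('cough ', 'DX', '#FF5733', '#464646')]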
src/inference.py
ADDED
@@ -0,0 +1,33 @@
import pandas as pd

def inference(model, texts: list, batch_size: int = 8):
    """
    Perform batched inference.

    Parameters:
        model: spacy model
        texts: list of input texts
        batch_size: batch size used for inference

    Returns:
        data: pandas.DataFrame with the inference output, one row per entity
    """
    docs = model.pipe(texts, batch_size=batch_size)

    records = []
    for no, doc in enumerate(docs):
        if len(doc.ents) > 0:
            records.append([{'id': no + 1, 'text': doc.text, 'span': entity.text,
                             'entity': entity.label_, 'start': entity.start, 'end': entity.end}
                            for entity in doc.ents])
        else:
            records.append([{'id': no + 1, 'text': doc.text, 'span': None,
                             'entity': None, 'start': None, 'end': None}])

    # flatten the per-document lists into one table indexed by (text, id)
    data = pd.DataFrame.from_dict(sum(records, [])).set_index(['text', 'id'])

    return data
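A short usage sketch, assuming one of the pinned models from requirements.txt is installed (the input sentences are illustrative only):

import spacy
from src.inference import inference

model = spacy.load("en_ner_bc5cdr_md")
data = inference(model, ["Patient has fever and cough.", "No acute findings."], batch_size=2)
# one row per detected entity (or a single all-None row per text), indexed by (text, id)
print(data)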
src/model_utils.py
ADDED
@@ -0,0 +1,111 @@
import json
from pathlib import Path

import spacy
from spacy.training import Example

def make_training_doc(nlp, data: list):
    """
    Convert data into spacy Example objects that can be used for training.

    Parameters:
        nlp: model
        data: training data as (text, annotations) pairs

    Returns:
        training_data: list of spacy Examples
    """
    training_data = []
    for text, annotations in data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        training_data.append(example)

    return training_data


def load_model(model: str = None):
    """
    Load the model indicated by `model`.

    Parameters:
        model: str, name of the model to load

    Returns:
        nlp: spacy model object
        optimizer: the optimizer to be used in training
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        optimizer = nlp.begin_training()

    return nlp, optimizer


def save_model(model, output_dir: str):
    """
    Save the model to output_dir.

    Parameters:
        model: spacy model
        output_dir: path
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        model.to_disk(output_dir)
        print("Saved model to", output_dir)

    return None


def load_data(args):
    """
    Load training data, evaluation data, as well as the entities dictionary.

    Parameters:
        args: dict, configuration from the config file

    Returns:
        train_dict, entities_dict, eval_dict
    """
    assert args['train_dir'] is not None, 'indicate path for training directory'

    # Load the training data
    with open(args['train_dir']) as f:
        train_dict = json.load(f)
    print('Loaded Training Data')

    try:
        entities_dict = train_dict[args['ent_key']]
        print('Loaded Entities from Training Data')
    except KeyError:
        entities_dict = None
        print('No classes for entities found in data loaded. Proceeding to check ent_dir')

    # Load entities
    if args['ent_dir'] is not None and entities_dict is None:
        with open(args['ent_dir']) as f:
            entities_dict = json.load(f)
        entities_dict = entities_dict[args['ent_key']]
        print('Loaded Entities from ent_dir')
    elif args['ent_dir'] is None and entities_dict is None:
        raise ValueError('No entities found from train_dir & ent_dir')

    # Load eval data
    if args['eval_dir'] is not None:
        with open(args['eval_dir']) as f:
            eval_dict = json.load(f)
        print('Loaded Evaluating Data')
    else:
        return train_dict, entities_dict, None

    return train_dict, entities_dict, eval_dict
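load_data only relies on a few keys of the configuration dict. A minimal sketch of the expected shape (the paths and the ent_key value here are hypothetical):

from src.model_utils import load_data

args = {
    'train_dir': 'data/train.json',   # required: json file containing the training data
    'ent_key': 'classes',             # hypothetical key under which the entity labels are stored
    'ent_dir': 'data/entities.json',  # optional fallback source for the entity labels
    'eval_dir': None,                 # optional evaluation data; None skips it
}
train_dict, entities_dict, eval_dict = load_data(args)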
src/negation.py
ADDED
@@ -0,0 +1,78 @@
import spacy
from negspacy.negation import Negex  # importing Negex registers the 'negex' factory
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

def negation(model, entities: dict):
    """
    Take the current model pipeline and add the negation component,
    configured with the given entity types.

    Parameters:
        model: spacy model
        entities: config dict with the entity types to negate

    Returns:
        model: spacy model with negex added to the pipeline
    """
    if 'parser' in model.pipe_names:
        model.remove_pipe('parser')
    if 'sentencizer' not in model.pipe_names:
        model.add_pipe('sentencizer')
    if 'negex' not in model.pipe_names:
        model.add_pipe('negex', config=entities)

    return model

def infer_negation(neg_model, model, text: str, pred_doc):
    """
    Match the results from the negation model against the results from
    the model, and replace the entity type of the spans or tokens in the
    prediction doc that should be negated with the entity type "NEG".

    Parameters:
        neg_model: spacy negation model
        model: spacy model
        text: text sample
        pred_doc: prediction of the text sample from model

    Returns:
        pred_doc: spacy doc with all entities that should be negated replaced with the "NEG" entity type
    """
    doc = neg_model(text)
    results = {'ent': [], 'start': [], 'end': []}
    for e in doc.ents:
        if e._.negex:
            results['ent'].append(e.text)
            results['start'].append(e.start)
            results['end'].append(e.end)

    patterns = [model.make_doc(t) for t in results['ent']]
    matcher = PhraseMatcher(model.vocab)
    matcher.add('NEG', patterns)

    # match all the tokens or spans of text detected as negated against the prediction doc
    matches = matcher(pred_doc)
    seen_tokens = set()
    new_entities = []
    entities = pred_doc.ents

    # keep only exact matches: not just the same span of words but also the same location
    for match in results['start']:
        for match_id, start, end in matches:
            if match == start:
                new_entities.append(Span(pred_doc, start, end, label=match_id))
                # drop any predicted entity that overlaps the negated span
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
    pred_doc.ents = tuple(entities) + tuple(new_entities)

    return pred_doc
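A short usage sketch of the two functions together, mirroring how app.py calls them. It assumes the en_ner_bc5cdr_md model is installed and that negspacy's default termsets cover the phrasing:

import spacy
from src.negation import negation, infer_negation

model = spacy.load("en_ner_bc5cdr_md")
text = "The patient has a cough but no fever."
pred_doc = model(text)  # predict before negex is added, as app.py does

neg_model = negation(model, {"ent_types": list(model.get_pipe("ner").labels)})
pred_doc = infer_negation(neg_model, model, text, pred_doc)
# negated spans now carry the NEG label
print([(e.text, e.label_) for e in pred_doc.ents])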
src/trainers.py
ADDED
@@ -0,0 +1,168 @@
import spacy
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from src.model_utils import *

import random
from tqdm import tqdm

def train_transformer(config: dict, train_data: list, components: list, iter: int,
                      batch_size=compounding(4.0, 32.0, 1.001), entities: list = None, eval_data: list = None):
    """
    Finetune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters
        train_data: list containing the training data
        components: list of components to be trained
        iter: int, number of iterations to train
        batch_size: batch size (or compounding schedule) used for training
        entities: list of entities to be trained on for NER
        eval_data: list containing the evaluation data

    Returns:
        nlp: spacy transformer pipeline
        all_losses: list of the losses at every iteration
    """
    if config['dir'] is not None:
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)

            task = nlp.get_pipe(component)
            if ('ner' in components) and (entities is not None):
                for label in entities:
                    task.add_label(label)

        nlp.initialize()  # don't forget this step!
        optimizer = nlp.create_optimizer()

    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)

    all_losses = []
    for itn in tqdm(range(1, iter + 1)):
        print("Starting iteration " + str(itn))
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
              format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

        all_losses.append([losses[component] for component in components])

    return nlp, all_losses


def train_spacy(model, train_data: list, components: list, iter: int,
                batch_size=compounding(4.0, 32.0, 1.001), entities: list = None, eval_data: list = None):
    """
    Finetune a spacy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name of the spacy model (None trains a blank 'en' model)
        train_data: list containing the training data
        components: list of components to be trained
        iter: int, number of iterations to train
        batch_size: batch size (or compounding schedule) used for training
        entities: list of entities to be trained on for NER
        eval_data: list containing the evaluation data

    Returns:
        nlp: spacy model
        all_losses: list of the losses at every iteration
    """
    # get model and optimizer: an existing spaCy model or a blank one
    nlp, optimizer = load_model(model)

    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)

    # create the built-in pipeline components and add them to the pipeline;
    # nlp.add_pipe works for built-ins that are registered with spaCy
    for component in components:
        if component not in nlp.pipe_names:
            ner = nlp.add_pipe(component, last=True)
        else:
            ner = nlp.get_pipe(component)

        # add labels if component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                ner.add_label(ent)

    print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')

    # get the names of the other pipes, to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.disable_pipes(*other_pipes):  # only train the selected components
        for itn in tqdm(range(1, iter + 1)):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
                  format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

            all_losses.append([losses[component] for component in components])

    return nlp, all_losses

def eval_spacy(model, data):
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spacy model or a spacy transformer pipeline
        data: evaluation data on which scoring is done

    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accepts spacy-format data: (text, annotations) tuples
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except TypeError:
        # accepts the alternative format: records with 'text' and 'entities'
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except Exception as e:
        print(e)
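train_spacy expects the training data as (text, annotations) pairs with character-offset entities. A toy sketch that resumes from the pinned base model (the texts, offsets and iteration count are illustrative only, and it assumes en_ner_bc5cdr_md is installed):

from src.trainers import train_spacy

train_data = [
    ("fever and cough for 3 days", {"entities": [(0, 5, "DX"), (10, 15, "DX")]}),
    ("no pleural effusion", {"entities": [(3, 19, "ABST_RECOVER")]}),
]
nlp, losses = train_spacy("en_ner_bc5cdr_md", train_data,
                          components=["ner"], iter=1,
                          entities=["DX", "ABST_RECOVER"])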
st_config.yaml
ADDED
@@ -0,0 +1,23 @@
#"/app/models/push"
model_dir: ""
default_models: ['en_Radiology_ner_bc5cdr_md','en_ner_bc5cdr_md']
examples:
  radiology_eval_dataset: "eval_35.csv"
  clinical note A:
    "A 74 year old woman with history of hypertension and heart disease who had \
    been discharged 10 days before knee prosthetic surgery was admitted with 4 day \
    history of fever dry cough and dyspnoea. She had not left home since discharge \
    and no family member was affected. Analysis revealed lymphopenia elevation of C \
    reactive protein and a positive RT PCR. The patient was admitted to the intensive \
    care unit with a favourable course. Chest X ray on the second day showed diffuse \
    reticular pattern and increased density in both lungs"
  Clinical note B:
    "A 29 year old immunocompromised female patient with a 3 day history of cough and fever. \
    Past medical history includes severe ulcerative colitis treated with Tofacitinib. \
    The patient was admitted to the hospital ward and discharged one week after admission \
    with complete recovery. Chest X ray shows increase of parenchymal opacity in right lower lobe"
  negation:
    "I still have a cough however there is an absence of fever. I do not have any headache nor \
    stomachache but I do have SARS and COVID 19"

colors_palette: ["#99A3A4","#bc4ed8","#FF5733","#54e96b","#2ce6f5","#f23fc4","#f3a53a","#7FB3D5","#EC7063","#F4D03F"]
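Each key under examples becomes an option in the app's "Select an example text" sidebar box; radiology_eval_dataset is special-cased in app.py to load eval_35.csv instead of filling the text area. colors_palette supplies one color per NER label, assigned in the order the selected model reports its labels.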