Spaces:
Sleeping
Sleeping
Commit
·
4f137b2
1
Parent(s):
275476a
"Commit_code"
Browse files- corona_pred.py +68 -0
- corona_train.py +39 -0
- sars_mers_cov_other_train.csv +0 -0
corona_pred.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Predict the coronavirus class of genome sequences listed in a CSV file.

Usage: python corona_pred.py <input.csv>

The input CSV is expected to contain a 'PID' column (sequence identifier)
and a 'SEQ' column (genome sequence).  The script loads the CountVectorizer
and MultinomialNB model produced by corona_train.py, predicts a class and a
confidence percentage per sequence, writes corona_pred_out.csv, and prints
an HTML results table to stdout.
"""
import pickle
import re
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# k-mer tokenization parameters -- must match the values used at training
# time, otherwise the loaded vectorizer's feature space will not line up.
kmer_size = 6
NGram = 4
#KFold_val = 10


def getKmers(sequence, size=kmer_size):
    """Return every overlapping lower-cased k-mer (default: hexamer) of *sequence*."""
    return [sequence[x:x + size].lower() for x in range(len(sequence) - size + 1)]


infile = sys.argv[1]
covid19df = pd.read_csv(infile)

# Tokenize each sequence into k-mer "words", then join them into one
# space-separated string per sequence, the format CountVectorizer expects.
covid19df['words'] = covid19df.apply(lambda x: getKmers(x['SEQ']), axis=1)
covid_texts = [' '.join(words) for words in covid19df['words']]

# Re-use the vectorizer fitted during training (transform, NOT fit_transform,
# so features match the model).  'with' ensures the pickle file is closed.
with open('countVectTrain.pkl', 'rb') as f:
    cv = pickle.load(f)
X = cv.transform(covid_texts)

# Load the trained classifier and predict a class plus confidence per row.
with open('corona_pred.pkl', 'rb') as f:
    model = pickle.load(f)
test_pred = model.predict(X)
pred_prob = model.predict_proba(X)
test_pred_prob = pred_prob.max(1) * 100  # top class probability, as a percentage

covid19df = covid19df.drop('words', axis=1)

df_test_pred = pd.DataFrame(data=test_pred, index=None, columns=["pred_label"])
df_pred_prob = pd.DataFrame(data=test_pred_prob, index=None, columns=["pred_prob_percentage"])

# Align the indexes before the column-wise concat so rows pair up correctly.
covid19df.reset_index(inplace=True, drop=True)
df_test_pred.reset_index(inplace=True, drop=True)
df_out = pd.concat([covid19df, df_test_pred, df_pred_prob], axis=1)
df_out.to_csv('corona_pred_out.csv', index=False)

# Emit an HTML table (the raw sequence column is dropped -- too long to
# display) with human-readable column headers substituted in.
df_out = df_out.drop('SEQ', axis=1)
df_out_html = df_out.to_html(index=False, justify='center')
df_out_html = re.sub(r'PID', r'Sequence ID', df_out_html)
df_out_html = re.sub(r'pred_label', r'Predicted Class', df_out_html)
df_out_html = re.sub(r'pred_prob_percentage', r'Probability (in %)', df_out_html)
print(df_out_html)
corona_train.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Train a MultinomialNB classifier on k-mer features of coronavirus genomes.

Reads sars_mers_cov_other_train.csv (columns 'SEQ' and 'CLASS'), tokenizes
each sequence into overlapping hexamers, vectorizes the tokens with 4-gram
counts, fits a Multinomial Naive Bayes model, and pickles both the fitted
CountVectorizer (countVectTrain.pkl) and the model (corona_pred.pkl) for
use by corona_pred.py.
"""
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# k-mer tokenization parameters -- corona_pred.py must use the same values.
kmer_size = 6
NGram = 4
#KFold_val = 10


def getKmers(sequence, size=kmer_size):
    """Return every overlapping lower-cased k-mer (default: hexamer) of *sequence*."""
    return [sequence[x:x + size].lower() for x in range(len(sequence) - size + 1)]


print('Reading file...')
#covid19df = pd.read_csv('SARS_MERS_COV_train.csv')
covid19df = pd.read_csv('sars_mers_cov_other_train.csv')

print('Creating token using K_Mer...')
covid19df['words'] = covid19df.apply(lambda x: getKmers(x['SEQ']), axis=1)
covid19df = covid19df.drop('SEQ', axis=1)

print('Converting token to list...')
# Join each row's k-mers into one space-separated string for CountVectorizer.
covid_texts = [' '.join(words) for words in covid19df['words']]
y_data = covid19df["CLASS"].values

print('Performing Count Vectorization...')
cv = CountVectorizer(ngram_range=(NGram, NGram))
X = cv.fit_transform(covid_texts)
# 'with' guarantees the pickle is flushed and closed (the bare open() used
# previously leaked the handle and risked a truncated file).
with open('countVectTrain.pkl', 'wb') as f:
    pickle.dump(cv, f)

print('Creating Classifiers...')
NB_classifier = MultinomialNB(alpha=0.1)
NB_classifier.fit(X, y_data)

# Persist the trained model for corona_pred.py.
with open('corona_pred.pkl', 'wb') as f:
    pickle.dump(NB_classifier, f)
sars_mers_cov_other_train.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|