model
Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:194deaf2b057e3eb519ffe122c6b7f79544d6b2a1de339555e410b029174b0b6
+size 234347529
Word2vec/run.py ADDED
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# @File : test_sentence_similarity.py
+# @Author: nixin
+# @Date : 2019-03-06
+
+import numpy as np
+from scipy import spatial
+from gensim.models import word2vec
+import pandas as pd
+
+
+# load the trained word vector model
+model = word2vec.Word2Vec.load('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/trained_word2vec.model')
+index2word_set = set(model.wv.index2word)
+
+def avg_feature_vector(sentence, model, num_features, index2word_set):
+    """Average the vectors of all in-vocabulary words in a sentence."""
+    words = sentence.split()
+    feature_vec = np.zeros((num_features,), dtype='float32')
+    n_words = 0
+    for word in words:
+        if word in index2word_set:
+            n_words += 1
+            feature_vec = np.add(feature_vec, model.wv[word])
+    if n_words > 0:
+        feature_vec = np.divide(feature_vec, n_words)
+    return feature_vec
+
+# read the problem corpus
+problem_corpus = pd.read_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv')
+problem_corpus = problem_corpus.head(100)
+
+target_problem = 'strategic cleavage of such a target rna will destroy its ability to direct synthesis of an encoded protein'
+target_domain = 'A'
+
+# remove problems that belong to the same domain as the target
+problem_corpus = problem_corpus[problem_corpus.Domain != target_domain]
+
+# restrict to the chosen publication-year range
+problem_corpus = problem_corpus[problem_corpus['publication_year'].between(2015, 2017)]
+
+# cosine similarity between the target problem and every corpus problem
+value = []
+for each_problem in problem_corpus['First part Contradiction']:
+    s1_afv = avg_feature_vector(target_problem, model=model, num_features=100, index2word_set=index2word_set)
+    s2_afv = avg_feature_vector(each_problem, model=model, num_features=100, index2word_set=index2word_set)
+    sim_value = round(1 - spatial.distance.cosine(s1_afv, s2_afv), 2)
+    value.append(sim_value)
+
+problem_corpus['similarity_value'] = value
+problem_corpus['target_problem'] = target_problem
+
+print(problem_corpus)
+
+# set similarity threshold and keep only problems above it
+problem_corpus_final = problem_corpus[problem_corpus.similarity_value >= 0.8]
+
+problem_corpus_final.to_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/simialrity_result/test.csv', index=False)
+print(problem_corpus_final)
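For a quick sanity check of the similarity step in run.py above, the averaged-vector cosine score can be compared against gensim's built-in KeyedVectors.n_similarity, which likewise averages the raw word vectors of each token list before taking the cosine. A minimal sketch, assuming a gensim 3.x model loaded from the repo-relative path added in this commit and two hypothetical sentences whose tokens are all in the vocabulary:

import numpy as np
from scipy import spatial
from gensim.models import word2vec

model = word2vec.Word2Vec.load('Word2vec/trained_word2vec.model')

# hypothetical example sentences; every token must be in the model vocabulary
s1 = 'cleavage of such a target rna'
s2 = 'synthesis of an encoded protein'

def avg_vec(sentence):
    # average the sentence's word vectors (same idea as avg_feature_vector in run.py)
    return np.mean([model.wv[w] for w in sentence.split()], axis=0)

manual = 1 - spatial.distance.cosine(avg_vec(s1), avg_vec(s2))
builtin = model.wv.n_similarity(s1.split(), s2.split())
print(round(manual, 2), round(builtin, 2))  # the two scores should agree up to rounding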
Word2vec/simialrity_result/test.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f100a1f9f61956bb4e97d177bc48b581c1ab4a925215c43d1cf9f8e590070774
+size 2601
Word2vec/trained_word2vec.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3546e4a57f7c76e9272566c43311dcebe354a3a968ea70b3f3a3b6d55c8f5977
+size 147031792