ernestchu committed
Commit da557ef · 1 Parent(s): 1da579e
Files changed (3)
  1. cacm.raw +0 -0
  2. common_words +429 -0
  3. hw2_part3_web.py +515 -0
cacm.raw ADDED
The diff for this file is too large to render. See raw diff
 
common_words ADDED
@@ -0,0 +1,429 @@
+ a
+ about
+ above
+ accordingly
+ across
+ after
+ afterwards
+ again
+ against
+ all
+ almost
+ alone
+ along
+ already
+ also
+ although
+ always
+ am
+ among
+ amongst
+ an
+ and
+ another
+ any
+ anybody
+ anyhow
+ anyone
+ anything
+ anywhere
+ apart
+ are
+ around
+ as
+ aside
+ at
+ away
+ awfully
+ b
+ be
+ became
+ because
+ become
+ becomes
+ becoming
+ been
+ before
+ beforehand
+ behind
+ being
+ below
+ beside
+ besides
+ best
+ better
+ between
+ beyond
+ both
+ brief
+ but
+ by
+ c
+ can
+ cannot
+ cant
+ certain
+ co
+ consequently
+ could
+ d
+ did
+ do
+ does
+ doing
+ done
+ down
+ downwards
+ during
+ e
+ each
+ eg
+ eight
+ either
+ else
+ elsewhere
+ enough
+ et
+ etc
+ even
+ ever
+ every
+ everybody
+ everyone
+ everything
+ everywhere
+ ex
+ except
+ f
+ far
+ few
+ fifth
+ first
+ five
+ for
+ former
+ formerly
+ forth
+ four
+ from
+ further
+ furthermore
+ g
+ get
+ gets
+ go
+ gone
+ got
+ h
+ had
+ hardly
+ has
+ have
+ having
+ he
+ hence
+ her
+ here
+ hereafter
+ hereby
+ herein
+ hereupon
+ hers
+ herself
+ him
+ himself
+ his
+ hither
+ how
+ howbeit
+ however
+ i
+ ie
+ if
+ immediate
+ in
+ inasmuch
+ inc
+ indeed
+ inner
+ insofar
+ instead
+ into
+ inward
+ is
+ it
+ its
+ itself
+ j
+ just
+ k
+ keep
+ kept
+ l
+ last
+ latter
+ latterly
+ least
+ less
+ lest
+ like
+ little
+ ltd
+ m
+ many
+ may
+ me
+ meanwhile
+ might
+ more
+ moreover
+ most
+ mostly
+ much
+ must
+ my
+ myself
+ n
+ namely
+ near
+ neither
+ never
+ nevertheless
+ new
+ next
+ nine
+ no
+ nobody
+ none
+ noone
+ nor
+ not
+ nothing
+ novel
+ now
+ nowhere
+ o
+ of
+ off
+ often
+ oh
+ old
+ on
+ once
+ one
+ ones
+ only
+ onto
+ or
+ other
+ others
+ otherwise
+ ought
+ our
+ ours
+ ourselves
+ out
+ outside
+ over
+ overall
+ own
+ p
+ particular
+ particularly
+ per
+ perhaps
+ please
+ plus
+ probably
+ q
+ que
+ quite
+ r
+ rather
+ really
+ relatively
+ respectively
+ right
+ s
+ said
+ same
+ second
+ secondly
+ see
+ seem
+ seemed
+ seeming
+ seems
+ self
+ selves
+ sensible
+ serious
+ seven
+ several
+ shall
+ she
+ should
+ since
+ six
+ so
+ some
+ somebody
+ somehow
+ someone
+ something
+ sometime
+ sometimes
+ somewhat
+ somewhere
+ still
+ sub
+ such
+ sup
+ t
+ than
+ that
+ the
+ their
+ theirs
+ them
+ themselves
+ then
+ thence
+ there
+ thereafter
+ thereby
+ therefore
+ therein
+ thereupon
+ these
+ they
+ third
+ this
+ thorough
+ thoroughly
+ those
+ though
+ three
+ through
+ throughout
+ thru
+ thus
+ to
+ together
+ too
+ toward
+ towards
+ twice
+ two
+ u
+ under
+ until
+ unto
+ up
+ upon
+ us
+ v
+ various
+ very
+ via
+ vs
+ viz
+ w
+ was
+ we
+ well
+ went
+ were
+ what
+ whatever
+ when
+ whence
+ whenever
+ where
+ whereafter
+ whereas
+ whereby
+ wherein
+ whereupon
+ wherever
+ whether
+ which
+ while
+ whither
+ who
+ whoever
+ whole
+ whom
+ whose
+ why
+ will
+ with
+ within
+ without
+ would
+ x
+ y
+ yet
+ you
+ your
+ yours
+ yourself
+ yourselves
+ z
+ zero
+ /*
+ manual
+ unix
+ programmer's
+ file
+ files
+ used
+ name
+ specified
+ value
+ given
+ return
+ use
+ following
+ current
+ using
+ normally
+ returns
+ returned
+ causes
+ described
+ contains
+ example
+ possible
+ useful
+ available
+ associated
+ would
+ cause
+ provides
+ taken
+ unless
+ sent
+ followed
+ indicates
+ currently
+ necessary
+ specify
+ contain
+ indicate
+ appear
+ different
+ indicated
+ containing
+ gives
+ placed
+ uses
+ appropriate
+ automatically
+ ignored
+ changes
+ way
+ usually
+ allows
+ corresponding
+ specifying
hw2_part3_web.py ADDED
@@ -0,0 +1,515 @@
+ import itertools
+ import re
+ from collections import Counter, defaultdict
+ from typing import Dict, List, NamedTuple
+ import argparse
+ import sys
+ import time
+ import threading
+
+ import gradio as gr
+ import numpy as np
+ from numpy.linalg import norm
+ import nltk
+ from nltk.stem.snowball import SnowballStemmer
+ from nltk.tokenize import word_tokenize
+ # nltk.download('punkt_tab')  # uncomment on first run to fetch tokenizer data
+
+ def spinner(stop_event):
+     '''Animates a console spinner until stop_event is set.'''
+     spinner_chars = itertools.cycle(['-', '\\', '|', '/'])
+     sys.stdout.write(next(spinner_chars))
+     sys.stdout.flush()
+     time.sleep(0.1)
+     while not stop_event.is_set():
+         sys.stdout.write(f'\b{next(spinner_chars)}')
+         sys.stdout.flush()
+         time.sleep(0.1)
+     print('\b \n')
+
+ # Create a threading event to stop the spinner
+ stop_event = threading.Event()
+
+ ### File IO and processing
+
+ class Document(NamedTuple):
+     doc_id: int
+     author: List[str]
+     title: List[str]
+     keyword: List[str]
+     abstract: List[str]
+
+     def sections(self):
+         return [self.author, self.title, self.keyword, self.abstract]
+
+     def __repr__(self):
+         return (f"doc_id: {self.doc_id}\n" +
+                 f"  author: {self.author}\n" +
+                 f"  title: {self.title}\n" +
+                 f"  keyword: {self.keyword}\n" +
+                 f"  abstract: {self.abstract}")
+
+
+ def read_stopwords(file):
+     with open(file) as f:
+         return set([x.strip() for x in f.readlines()])
+
+ stopwords = read_stopwords('common_words')
+
+ stemmer = SnowballStemmer('english')
+
+ def read_rels(file):
+     '''
+     Reads the file of relevance judgments and returns a dictionary of
+     query id -> list of relevant document ids
+     '''
+     rels = {}
+     with open(file) as f:
+         for line in f:
+             qid, rel = line.strip().split()
+             qid = int(qid)
+             rel = int(rel)
+             if qid not in rels:
+                 rels[qid] = []
+             rels[qid].append(rel)
+     return rels
+
+ def read_docs(file):
+     '''
+     Reads the corpus into a list of Documents
+     '''
+     docs = [defaultdict(list)]  # empty 0 index
+     category = ''
+     with open(file) as f:
+         i = 0
+         for line in f:
+             line = line.strip()
+             if line.startswith('.I'):
+                 i = int(line[3:])
+                 docs.append(defaultdict(list))
+             elif re.match(r'\.\w', line):
+                 category = line[1]
+             elif line != '':
+                 for word in word_tokenize(line):
+                     docs[i][category].append(word.lower())
+
+     return [Document(i + 1, d['A'], d['T'], d['K'], d['W'])
+             for i, d in enumerate(docs[1:])]
+
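+ # For reference, a CACM record looks roughly like this (illustrative excerpt,
+ # not a verbatim entry from cacm.raw):
+ #
+ #   .I 1
+ #   .T
+ #   An Algorithm for the Description of Problems
+ #   .A
+ #   Perlis, A. J.
+ #   .W
+ #   One or more paragraphs of abstract text ...
+ #
+ # read_docs maps the markers .A -> author, .T -> title, .K -> keyword,
+ # .W -> abstract, and tokenizes/lowercases every content line.
+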
+ def read_docs_for_presentation(file):
+     docs = [defaultdict(str)]  # empty 0 index
+     category = ''
+     with open(file) as f:
+         i = 0
+         for line in f:
+             line = line.strip()
+             if line.startswith('.I'):
+                 i = int(line[3:])
+                 docs.append(defaultdict(str))
+             elif re.match(r'\.\w', line):
+                 category = line[1]
+             elif line != '':
+                 if docs[i][category] == '':
+                     docs[i][category] = line
+                 else:
+                     if docs[i][category][-1] == '.':
+                         docs[i][category] = f'{docs[i][category]} {line}'
+                     else:
+                         docs[i][category] = f'{docs[i][category]}. {line}'
+
+     return [Document(i + 1, d['A'], d['T'], d['K'], d['W'])
+             for i, d in enumerate(docs[1:])]
+
+ def stem_doc(doc: Document):
+     return Document(doc.doc_id, *[[stemmer.stem(word) for word in sec]
+                                   for sec in doc.sections()])
+
+ def stem_docs(docs: List[Document]):
+     return [stem_doc(doc) for doc in docs]
+
+ def remove_stopwords_doc(doc: Document):
+     return Document(doc.doc_id, *[[word for word in sec if word not in stopwords]
+                                   for sec in doc.sections()])
+
+ def remove_stopwords(docs: List[Document]):
+     return [remove_stopwords_doc(doc) for doc in docs]
+
+
+ ### Term-Document Matrix
+
+ class TermWeights(NamedTuple):
+     author: float
+     title: float
+     keyword: float
+     abstract: float
+
+ def compute_doc_freqs(docs: List[Document]):
+     '''
+     Computes document frequency, i.e. how many documents contain a specific word
+     '''
+     freq = Counter()
+     for doc in docs:
+         words = set()
+         for sec in doc.sections():
+             for word in sec:
+                 words.add(word)
+         for word in words:
+             freq[word] += 1
+     return freq
+
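+ # Sanity check (hypothetical numbers): if 'sort' occurs in 12 documents, then
+ # compute_doc_freqs(docs)['sort'] == 12; repeated occurrences inside a single
+ # document count once, since each document contributes a set of its words.
+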
+ def compute_tf(doc: Document, doc_freqs: Dict[str, int], weights: TermWeights):
+     vec = defaultdict(float)
+     for word in doc.author:
+         vec[word] += weights.author
+     for word in doc.keyword:
+         vec[word] += weights.keyword
+     for word in doc.title:
+         vec[word] += weights.title
+     for word in doc.abstract:
+         vec[word] += weights.abstract
+     return dict(vec)  # convert back to a regular dict
+
+ def compute_tfidf(doc, doc_freqs, weights):
+     tfidf = defaultdict(float)
+     tf = compute_tf(doc, doc_freqs, weights)
+     N = 3204  # number of documents in the CACM collection
+     for word in tf:
+         idf = np.log((1 + N) / (1 + doc_freqs[word]))  # smoothed idf
+         tfidf[word] = tf[word] * idf
+     return dict(tfidf)  # convert back to a regular dict
+
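+ # Worked example of the smoothed idf above (illustrative document frequencies):
+ # with N = 3204, a term found in 10 documents gets
+ #     idf = ln((1 + 3204) / (1 + 10)) = ln(3205 / 11) ≈ 5.67
+ # while a term found in 3000 documents gets ln(3205 / 3001) ≈ 0.07, so
+ # near-ubiquitous terms contribute almost nothing to the tf-idf score.
+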
+ def compute_boolean(doc, doc_freqs, weights):
+     vec = defaultdict(float)
+     for word in doc.author:
+         vec[word] = weights.author
+     for word in doc.keyword:
+         vec[word] = weights.keyword
+     for word in doc.title:
+         vec[word] = weights.title
+     for word in doc.abstract:
+         vec[word] = weights.abstract
+     return dict(vec)  # convert back to a regular dict
+
+
+ ### Vector Similarity
+
+ def dictdot(x: Dict[str, float], y: Dict[str, float]):
+     '''
+     Computes the dot product of vectors x and y, represented as sparse dictionaries.
+     '''
+     keys = list(x.keys()) if len(x) < len(y) else list(y.keys())
+     return sum(x.get(key, 0) * y.get(key, 0) for key in keys)
+
+ def cosine_sim_dict(x, y):
+     '''
+     Computes the cosine similarity between two sparse term vectors represented as dictionaries.
+     '''
+     num = dictdot(x, y)
+     if num == 0:
+         return 0
+     return num / (norm(list(x.values())) * norm(list(y.values())))
+
+ def cosine_sim(x, y):
+     if isinstance(x, dict):
+         return cosine_sim_dict(x, y)
+     return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
+
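+ # Minimal usage sketch with hypothetical sparse vectors:
+ #     x = {'sort': 1.0, 'merge': 1.0}
+ #     y = {'merge': 1.0, 'tree': 1.0}
+ #     cosine_sim(x, y)  # dot = 1.0 and |x| = |y| = sqrt(2), so returns 0.5
+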
+ # Alternative set-style similarity measures from the homework. They operate on
+ # sparse dict vectors only and are unused by this web demo, which ranks with
+ # cosine_sim (optionally on dense SVD-projected vectors).
+
+ def dice_sim(x, y):
+     num = 2 * dictdot(x, y)
+     if num == 0:
+         return 0
+     denom = sum(list(x.values())) + sum(list(y.values()))
+     return num / denom if denom != 0 else 0
+
+ def jaccard_sim(x, y):
+     num = dictdot(x, y)
+     if num == 0:
+         return 0
+     denom = sum(list(x.values())) + sum(list(y.values())) - num
+     return num / denom if denom != 0 else 0
+
+ def overlap_sim(x, y):
+     num = dictdot(x, y)
+     if num == 0:
+         return 0
+     denom = min(sum(list(x.values())), sum(list(y.values())))
+     return num / denom if denom != 0 else 0
+
+
+ ### Precision/Recall
+
+ def interpolate(x1, y1, x2, y2, x):
+     m = (y2 - y1) / (x2 - x1)
+     b = y1 - m * x1
+     return m * x + b
+
+ def precision_at(recall: float, results: List[int], relevant: List[int]) -> float:
+     '''
+     Computes the precision at the specified recall level.
+     If the recall level falls between two measured points, linearly interpolate
+     between the two closest points. For example, with 4 relevant documents
+     (recall points 0.25, 0.5, 0.75, and 1.0), precision @ recall 0.6 is
+
+         interpolate(0.5, prec @ 0.5, 0.75, prec @ 0.75, 0.6)
+
+     Note that there is implicitly a point (recall=0, precision=1).
+
+     `results` is a ranked list of document ids
+     `relevant` is a list of relevant document ids
+     '''
+     assert recall >= 0 and recall <= 1, f'Invalid recall: {recall}'
+     recalls = [0]
+     precisions = [1]
+     recalls += [(i + 1) / len(relevant) for i in range(len(relevant))]
+     ranks = sorted([results.index(rel) + 1 for rel in relevant])
+     precisions += [(i + 1) / rk for i, rk in enumerate(ranks)]
+
+     # find the last measured recall point below the requested level
+     idx = 0
+     for i, rec in enumerate(recalls):
+         if recall > rec:
+             idx = i
+     r1 = recalls[idx]
+     r2 = recalls[idx + 1]
+
+     val = interpolate(r1, precisions[idx], r2, precisions[idx + 1], recall)
+     return val
+
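+ # Worked example (hypothetical ranking): with results = [3, 1, 7, 4] and
+ # relevant = [3, 4], the measured points are recalls [0, 0.5, 1.0] with
+ # precisions [1, 1.0, 0.5] (hits at ranks 1 and 4), so precision_at(0.75, ...)
+ # interpolates between (0.5, 1.0) and (1.0, 0.5) to give 0.75.
+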
+ def mean_precision1(results, relevant):
+     return (precision_at(0.25, results, relevant) +
+             precision_at(0.5, results, relevant) +
+             precision_at(0.75, results, relevant)) / 3
+
+ def mean_precision2(results, relevant):
+     return sum([precision_at((i + 1) / 10, results, relevant) for i in range(10)]) / 10
+
+ def norm_recall(results, relevant):
+     N = len(results)
+     num_rel = len(relevant)
+     ranks = [results.index(rel) + 1 for rel in relevant]
+     return 1 - (sum(ranks) - sum(i + 1 for i in range(num_rel))) / num_rel / (N - num_rel)
+
+ def norm_precision(results, relevant):
+     N = len(results)
+     num_rel = len(relevant)
+     ranks = [results.index(rel) + 1 for rel in relevant]
+     denom = N * np.log(N) - (N - num_rel) * np.log(N - num_rel) - num_rel * np.log(num_rel)
+     return 1 - (sum(np.log(r) for r in ranks) - sum(np.log(i + 1) for i in range(num_rel))) / denom
+
+
+ ### Extensions
+
+ # TODO: put any extensions here
+
+ def to_full_matrix(doc_vectors):
+     '''
+     Converts a list of sparse term vectors into a full term-document matrix.
+     '''
+     # collect the vocabulary across all documents
+     words = set()
+     for doc_vec in doc_vectors:
+         words.update(doc_vec.keys())
+     words = list(words)
+     word_index = {word: i for i, word in enumerate(words)}
+
+     matrix = np.zeros((len(doc_vectors), len(words)))
+     for i, doc_vec in enumerate(doc_vectors):
+         for word, val in doc_vec.items():
+             matrix[i, word_index[word]] = val
+     return matrix, words
+
+ def sparse_svd(doc_vectors, rank):
+     '''
+     Latent semantic analysis: factor the term-document matrix with SVD and keep
+     the top `rank` right singular vectors as the latent space. Returns the
+     projected document vectors and a function that projects a sparse query
+     vector into the same space.
+     '''
+     doc_matrix, words = to_full_matrix(doc_vectors)
+     _, _, Vt = np.linalg.svd(doc_matrix)
+     Vt_k = Vt[:rank, :]
+
+     doc_matrix = doc_matrix @ Vt_k.T
+
+     def project_fn(input_vector):
+         output_vector = np.zeros(len(words))
+         for word, val in input_vector.items():
+             if word in words:
+                 output_vector[words.index(word)] = val
+         return output_vector @ Vt_k.T
+
+     return [vec for vec in doc_matrix], project_fn
+
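+ # Usage sketch (hypothetical rank): reduce the documents to a k-dimensional
+ # latent space once, then project each incoming sparse query into that space:
+ #     doc_vecs_k, project = sparse_svd(doc_vectors, 100)
+ #     scores = [cosine_sim(project(query_vec), d) for d in doc_vecs_k]
+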
+ def formatted_output_for_doc(doc):
+     res = ''
+     res = res + '# ' + ' '.join(doc.title) + '\n'
+     if doc.author:
+         res = res + ' by ' + ' '.join(doc.author) + '\n'
+     if doc.abstract:
+         res = res + ' ' + ' '.join(doc.abstract) + '\n'
+     return res
+
+ ### Search
+
+ def setup():
+     # args = parse_args()  # CLI parsing is bypassed for the web demo
+     args = argparse.Namespace(use_svd=True, svd_rank=3000)
+
+     print('Starting search engine ', end='')
+     if args.use_svd:
+         print('(with SVD) ', end='')
+
+     # Start the spinner in a separate thread
+     spinner_thread = threading.Thread(target=spinner, args=(stop_event,))
+     spinner_thread.start()
+
+     docs = read_docs('cacm.raw')
+     # queries = read_docs('query.raw')
+     # rels = read_rels('query.rels')
+     stopwords = read_stopwords('common_words')
+
+     term_func = compute_tfidf
+     sim_func = cosine_sim
+     svd_rank = args.svd_rank
+
+     # fixed preprocessing configuration for the web demo
+     stem = True
+     removestop = True
+     term_weights = TermWeights(author=3, title=3, keyword=4, abstract=1)
+
+     processed_docs = process_docs(docs, stem, removestop, stopwords)
+     doc_freqs = compute_doc_freqs(processed_docs)
+     doc_vectors = [term_func(doc, doc_freqs, term_weights) for doc in processed_docs]
+     if args.use_svd:
+         doc_vectors, svd_project_fn = sparse_svd(doc_vectors, svd_rank)
+
+     # Stop the spinner
+     stop_event.set()
+     spinner_thread.join()
+
+     def search_query(query):
+         # wrap the free-text query in a one-document corpus file so it can be
+         # read and preprocessed exactly like the documents
+         tmp_query_file = '/tmp/irhw2'
+         with open(tmp_query_file, 'w') as f:
+             print(f"""
+ .I 1
+ .W
+ {query}
+ """, file=f)
+         queries = read_docs(tmp_query_file)
+         processed_queries = process_docs(queries, stem, removestop, stopwords)
+
+         query_doc = processed_queries[0]
+         query_vec = term_func(query_doc, doc_freqs, term_weights)
+         if args.use_svd:
+             query_vec = svd_project_fn(query_vec)
+         results = search(doc_vectors, query_vec, sim_func)
+         return results
+
+     docs_present = read_docs_for_presentation('cacm.raw')
+
+     return search_query, docs_present
+
+ def process_docs(docs, stem, removestop, stopwords):
+     processed_docs = docs
+     if removestop:
+         processed_docs = remove_stopwords(processed_docs)
+     if stem:
+         processed_docs = stem_docs(processed_docs)
+     return processed_docs
+
+ def process_docs_and_queries(docs, queries, stem, removestop, stopwords):
+     processed_docs = docs
+     processed_queries = queries
+     if removestop:
+         processed_docs = remove_stopwords(processed_docs)
+         processed_queries = remove_stopwords(processed_queries)
+     if stem:
+         processed_docs = stem_docs(processed_docs)
+         processed_queries = stem_docs(processed_queries)
+     return processed_docs, processed_queries
+
+
+ def search(doc_vectors, query_vec, sim):
+     '''Ranks every document by similarity to the query; returns (doc_id, score) pairs.'''
+     results_with_score = [(doc_id + 1, sim(query_vec, doc_vec))
+                           for doc_id, doc_vec in enumerate(doc_vectors)]
+     results_with_score = sorted(results_with_score, key=lambda x: -x[1])
+     return results_with_score
+
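+ # e.g. search(doc_vectors, query_vec, cosine_sim)[:3] might return
+ # [(1410, 0.91), (2188, 0.87), (1572, 0.85)] (ids and scores are hypothetical).
+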
+ def search_debug(docs, query, relevant, doc_vectors, query_vec, sim):
+     results_with_score = [(doc_id + 1, sim(query_vec, doc_vec))
+                           for doc_id, doc_vec in enumerate(doc_vectors)]
+     results_with_score = sorted(results_with_score, key=lambda x: -x[1])
+     results = [x[0] for x in results_with_score]
+
+     print('Query:', query)
+     print('Relevant docs: ', relevant)
+     print()
+     for doc_id, score in results_with_score[:10]:
+         print('Score:', score)
+         print(docs[doc_id - 1])
+         print()
+
+ def parse_args():
+     arg_parser = argparse.ArgumentParser()
+     arg_parser.add_argument('--use_svd', action='store_true')
+     arg_parser.add_argument('--svd_rank', type=int, default=3000)
+     return arg_parser.parse_args()
+
+ search_query, docs = setup()
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Search Engine")
+     with gr.Row():
+         query = gr.Textbox(label="Query", autofocus=True)
+
+     num_results_step = 5
+     num_results = gr.State(num_results_step)
+
+     @gr.render(inputs=[query, num_results], triggers=[query.submit, num_results.change])
+     def render_results(query, num_res):
+         if query.strip() != '':
+             results = search_query(query)[:num_res]
+             for doc_id, score in results:
+                 doc = docs[doc_id - 1]
+                 html = f"""
+                 <div style="margin: 30px 0">
+                     <div style="display: flex; align-items: center; gap: 10px;">
+                         <img src="https://www.cs.jhu.edu/favicon.ico" width="25px">
+                         <div style="color: #202124; font-size: 14px;">{doc.author if doc.author.strip() else 'No author provided'}</div>
+                     </div>
+                     <div style="font-size: 20px; color: rgb(26, 13, 171); cursor: pointer; margin: 10px 0" onclick="alert('Just a mockup search engine, lol.')">{doc.title}</div>
+                     <div style="color: rgb(71, 71, 71);">{doc.abstract if doc.abstract.strip() else 'No abstract provided'}<br>Relevance score: {score:.3f}</div>
+                 </div>
+                 """
+                 gr.HTML(html)
+             gr.HTML('<div style="margin: 50px"></div>')
+             more_btn = gr.Button('More like this')
+             more_btn.click(lambda x: x + num_results_step, num_results, num_results)
+
+     # reset the result count whenever the query changes
+     query.change(lambda _: num_results_step, num_results, num_results)
+
+ if __name__ == '__main__':
+     demo.launch(
+         # server_name="0.0.0.0",  # uncomment to serve on all interfaces
+         server_port=7861,
+     )