tien314 committed on
Commit
28621b1
·
verified ·
1 Parent(s): 0cc0966

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
from operator import itemgetter

import bm25s
import pandas as pd
import streamlit as st
6
+
7
+
8
# cache_resource (rather than cache_data) because the return value is an
# index object meant to be built once and shared, not data to be copied
# on every call.
@st.cache_resource
def load_data(path="/kaggle/input/cleanlist/cleaned_list.csv"):
    """Build a BM25 retriever over the documents stored in a CSV file.

    The file is read without a header row and must contain exactly one
    column; every row becomes one document of the corpus.

    Args:
        path: Location of the CSV file holding the documents.

    Returns:
        A ``bm25s.BM25`` instance that keeps the corpus and has been
        indexed on its tokenized form.
    """
    df = pd.read_csv(path, header=None)
    df.columns = ['document']
    corpus = df['document'].to_list()

    retriever = bm25s.BM25(corpus=corpus)
    retriever.index(bm25s.tokenize(corpus))

    return retriever
18
+
19
def extract_hscode(text):
    """Pull the digits that follow an ``hs_code:`` label out of *text*.

    Only the first occurrence is considered. Whitespace between the colon
    and the digits is allowed.

    Returns:
        The digit run as a string, or ``None`` when *text* contains no
        ``hs_code:`` label followed by digits.
    """
    found = re.search(r'hs_code:\s*(\d+)', text)
    return None if found is None else found.group(1)
24
+
25
def _pad_hs_code(code):
    """Return *code* as a string, prefixing 5-character codes with one '0'."""
    text = str(code)
    return '0' + text if len(text) == 5 else text


# Table of HS codes, loaded once at script start.
df2 = pd.read_csv("/kaggle/input/hscode-miss/hscode_main.csv")

# Store every code as a string; 5-character codes get a leading '0'.
# NOTE(review): presumably the CSV parser read the codes as integers and
# dropped a leading zero from 6-digit codes — confirm against the data.
df2['hs_code'] = df2['hs_code'].map(_pad_hs_code)
36
+
37
# Build the retriever once per session; later reruns of the script reuse
# the instance kept in session state.
if 'retriever' not in st.session_state or st.session_state.retriever is None:
    st.session_state.retriever = load_data()


sentence = st.text_input("please enter description:")

# Skip blank or whitespace-only input: there is nothing to search for.
if sentence.strip():
    # The query is the text the user typed. retrieve() yields a
    # (documents, scores) pair; row 0 holds the hits for this single query.
    results, _ = st.session_state.retriever.retrieve(
        bm25s.tokenize(sentence), k=5
    )
    for item in results[0]:
        code = extract_hscode(item)
        if code is None:
            # The retrieved document carries no "hs_code:" label.
            continue
        matches = df2.loc[df2['hs_code'] == code, 'full_description']
        st.write("Hscode:", code)
        if matches.empty:
            # The code is missing from the lookup table; report that
            # instead of failing on an empty selection.
            st.write("answer:", "no description found")
        else:
            st.write("answer:", matches.iloc[0])