kkngan committed on
Commit 7e0431e · verified · 1 Parent(s): f1f604e

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +77 -0
  3. bert-itserviceclassification +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ bert-itserviceclassification filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,77 @@
+ import streamlit as st
+ from streamlit_mic_recorder import mic_recorder
+ from transformers import pipeline
+ import torch
+ from transformers import BertTokenizer, BertForSequenceClassification
+
+ def callback():
+     if st.session_state.my_recorder_output:
+         audio_bytes = st.session_state.my_recorder_output['bytes']
+         st.audio(audio_bytes)
+
+ def transcribe(upload):
+     pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+     result = pipe(upload, generate_kwargs={'task': 'transcribe'})
+     print(result['text'])
+     return result['text']
+
+ def encode(docs, tokenizer):
+     '''
+     This function takes a list of texts and returns the input_ids and attention_mask of the texts
+     '''
+     encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, max_length=128, padding='max_length',
+                                                return_attention_mask=True, truncation=True, return_tensors='pt')
+     input_ids = encoded_dict['input_ids']
+     attention_masks = encoded_dict['attention_mask']
+     return input_ids, attention_masks
+
+
+ def load_model():
+     CUSTOMMODEL_PATH = "./bert-itserviceclassification"
+     PRETRAINED_LM = "bert-base-uncased"
+     tokenizer = BertTokenizer.from_pretrained(PRETRAINED_LM, do_lower_case=True)
+     model = BertForSequenceClassification.from_pretrained(PRETRAINED_LM,
+                                                           num_labels=8,
+                                                           output_attentions=False,
+                                                           output_hidden_states=False)
+     model.load_state_dict(torch.load(CUSTOMMODEL_PATH))
+     return model, tokenizer
+
+
+ def predict(text, model, tokenizer):
+     lookup_key = {0: 'Hardware',
+                   1: 'Access',
+                   2: 'Miscellaneous',
+                   3: 'HR Support',
+                   4: 'Purchase',
+                   5: 'Administrative rights',
+                   6: 'Storage',
+                   7: 'Internal Project'}
+     with torch.no_grad():
+         input_ids, att_mask = encode([text], tokenizer)
+         logits = model(input_ids=input_ids, attention_mask=att_mask).logits
+         predicted_class_id = logits.argmax().item()
+         predicted_label = lookup_key.get(predicted_class_id)
+         return predicted_label
+
+
+ def main():
+
+     st.set_page_config(layout="wide", page_title="IT Service NLP Classification")
+
+     with st.sidebar:
+         audio = mic_recorder(key='my_recorder', callback=callback)
+         button = st.button('start classification')
+
+     if button:
+         st.write('Loading')
+         text = transcribe(upload=audio["bytes"])
+         st.write('Speech-to-text Result:')
+         st.write(f'{text}')
+         model, tokenizer = load_model()
+         prediction = predict(text=text, model=model, tokenizer=tokenizer)
+         st.write('Classification Result:')
+         st.write(f'{prediction}')
+
+ if __name__ == '__main__':
+     main()
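For context (not part of this commit): load_model() restores the fine-tuned weights with model.load_state_dict(torch.load("./bert-itserviceclassification")), so the LFS file added below is presumably a plain PyTorch state dict rather than a save_pretrained() directory. A minimal sketch of how such a file could have been exported, assuming the same 8-label BertForSequenceClassification head; the fine-tuning loop itself is omitted:

    import torch
    from transformers import BertForSequenceClassification

    # Hypothetical export step matching what load_model() in app.py expects:
    # a raw state_dict saved with torch.save, not a save_pretrained() folder.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)
    # ... fine-tune on the labelled IT service tickets (omitted) ...
    torch.save(model.state_dict(), "bert-itserviceclassification")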
bert-itserviceclassification ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbed72a8eda5109d533406b8333c9b23d3f263d0369a0f301198d23ff84095cd
+ size 438035988
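The three lines above are a standard Git LFS pointer: the oid is the SHA-256 of the actual weights file and size is its byte count. A minimal sketch, as an assumption rather than part of the repo, for checking that the file fetched with git lfs pull matches this pointer:

    import hashlib
    from pathlib import Path

    # Hypothetical check that the materialised LFS object matches the pointer above.
    EXPECTED_OID = "fbed72a8eda5109d533406b8333c9b23d3f263d0369a0f301198d23ff84095cd"
    EXPECTED_SIZE = 438035988

    path = Path("bert-itserviceclassification")
    sha = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)

    assert path.stat().st_size == EXPECTED_SIZE, "size mismatch (file may still be a pointer)"
    assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"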