celise88 committed
Commit 6d4e9bd · 1 Parent(s): 870c9da

reorganize flow
Files changed (3)
  1. README.md +3 -0
  2. main.py +16 -50
  3. utils.py +34 -0
README.md CHANGED
@@ -4,6 +4,9 @@ emoji: 🗺️
 colorFrom: blue
 colorTo: green
 sdk: docker
+python_version: 3.10.9
+app_port: 7860
+models: celise88/distilbert-base-uncased-finetuned-binary-classifier
 pinned: true
 ---
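The new models line links the Space to the fine-tuned classifier on the Hub, while python_version and app_port pin the Docker runtime. As a quick check that the declared model repo actually resolves, a minimal sketch (assuming the repo is public and transformers is installed; the sample input is illustrative only):

# Load the Hub model declared in the README metadata and classify one token.
from transformers import pipeline

classifier = pipeline(
    'text-classification',
    model='celise88/distilbert-base-uncased-finetuned-binary-classifier',
)
print(classifier('python')[0])  # e.g. {'label': 'LABEL_1', 'score': ...}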
 
main.py CHANGED
@@ -7,16 +7,10 @@ import requests
 from bs4 import BeautifulSoup
 from cleantext import clean
 from docx import Document
-import os
-import cohere
-import string
 import numpy as np
-from numpy.linalg import norm
-from nltk.tokenize import SpaceTokenizer
-import nltk
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
-from dotenv import load_dotenv
-load_dotenv()
+import utils
+from utils import coSkillEmbed, cosine, clean_my_text
 
 app = FastAPI()
 app.mount("/static", StaticFiles(directory='static'), name="static")
@@ -27,7 +21,6 @@ simdat = pd.read_csv('static/cohere_embeddings.csv')
 
 model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
 tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
-classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
 
 ### job information center ###
 # get
@@ -85,6 +78,9 @@ async def match_page(request: Request):
 # post
 @app.post('/find-my-match', response_class=HTMLResponse)
 def get_resume(request: Request, resume: UploadFile = File(...)):
+
+    classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
+
     path = f"static/{resume.filename}"
     with open(path, 'wb') as buffer:
         buffer.write(resume.file.read())
@@ -94,22 +90,6 @@ def get_resume(request: Request, resume: UploadFile = File(...)):
         text.append(para.text)
     resume = "\n".join(text)
 
-    def clean_my_text(text):
-        clean_text = ' '.join(text.splitlines())
-        clean_text = clean_text.replace('-', " ").replace("/"," ")
-        clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
-        return clean_text
-
-    def coSkillEmbed(text):
-        co = cohere.Client(os.getenv("COHERE_TOKEN"))
-        response = co.embed(
-            model='large',
-            texts=[text])
-        return response.embeddings
-
-    def cosine(A, B):
-        return np.dot(A,B)/(norm(A)*norm(B))
-
     embeds = coSkillEmbed(resume)
     simResults = []
 
@@ -126,29 +106,15 @@ def get_resume(request: Request, resume: UploadFile = File(...)):
     simResults.reset_index(drop=True, inplace=True)
     for x in range(len(simResults)):
         simResults.iloc[x,1] = "{:0.2f}".format(simResults.iloc[x,1])
-
-    # EXTRACT SKILLS FROM RESUME
-    def skillNER(resume):
-        resume = clean_my_text(resume)
-        stops = set(nltk.corpus.stopwords.words('english'))
-        stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
-            'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
-            'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
-            'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
-        resume = [word for word in SpaceTokenizer().tokenize(resume) if word not in stops]
-        resume = [word for word in resume if ")" not in word]
-        resume = [word for word in resume if "(" not in word]
-
-        labels = []
-        for i in range(len(resume)):
-            classification = classifier(resume[i])[0]['label']
-            if classification == 'LABEL_1':
-                labels.append("Skill")
-            else:
-                labels.append("Not Skill")
-        labels_dict = dict(zip(resume, labels))
-        return labels_dict
 
-    skills=skillNER(resume)
-
-    return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
+    cleantext = clean_my_text(resume)
+    labels = []
+    for i in range(len(cleantext)):
+        classification = classifier(cleantext[i])[0]['label']
+        if classification == 'LABEL_1':
+            labels.append("Skill")
+        else:
+            labels.append("Not Skill")
+    skills = dict(zip(cleantext, labels))
+
+    return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
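With the helpers moved to utils.py, get_resume now reads top to bottom: save the upload, extract text from the .docx, embed and rank against the precomputed job embeddings, then token-classify for skills. A minimal smoke test for the reorganized endpoint (a sketch, assuming the app is serving locally on the app_port 7860 declared in README.md; sample_resume.docx is a hypothetical file):

# POST a .docx resume to /find-my-match and inspect the rendered response.
import requests

with open('sample_resume.docx', 'rb') as f:
    resp = requests.post(
        'http://localhost:7860/find-my-match',
        files={'resume': ('sample_resume.docx', f)},  # field name matches the UploadFile parameter
    )
print(resp.status_code)  # expect 200 once find_my_match.html renders
print(resp.text[:300])   # start of the rendered HTML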
utils.py ADDED
@@ -0,0 +1,34 @@
+from cleantext import clean
+import cohere
+import string
+import numpy as np
+from numpy.linalg import norm
+from nltk.tokenize import SpaceTokenizer
+import nltk
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+def coSkillEmbed(text):
+    co = cohere.Client(os.getenv("COHERE_TOKEN"))
+    response = co.embed(
+        model='large',
+        texts=[text])
+    return response.embeddings
+
+def cosine(A, B):
+    return np.dot(A,B)/(norm(A)*norm(B))
+
+def clean_my_text(resume):
+    clean_text = ' '.join(resume.splitlines())
+    clean_text = clean_text.replace('-', " ").replace("/"," ")
+    clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
+    stops = set(nltk.corpus.stopwords.words('english'))
+    stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
+        'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
+        'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
+        'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
+    resume = [word for word in SpaceTokenizer().tokenize(clean_text) if word not in stops]
+    resume = [word for word in resume if ")" not in word]
+    resume = [word for word in resume if "(" not in word]
+    return resume
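Together these helpers cover the whole matching flow: coSkillEmbed fetches a Cohere embedding, cosine scores it against the precomputed job embeddings, and clean_my_text reduces a resume to stopword-filtered tokens for the classifier. A minimal sketch of how they compose (assuming COHERE_TOKEN is set in the environment; the two sample strings are illustrative only):

# Embed two texts, score their similarity, and tokenize one for skill extraction.
import numpy as np
from utils import coSkillEmbed, cosine, clean_my_text

resume_text = 'Built data pipelines in Python and managed client reporting'
job_text = 'Python developer responsible for large-scale data pipelines'

resume_vec = np.array(coSkillEmbed(resume_text)[0])  # co.embed returns one embedding per input text
job_vec = np.array(coSkillEmbed(job_text)[0])
print('{:0.2f}'.format(cosine(resume_vec, job_vec)))  # formatted like the scores in main.py

print(clean_my_text(resume_text))  # tokens ready for the text-classification pipeline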