celise88 committed on
Commit
e121dec
·
1 Parent(s): c8742c4

improving embeddings

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. main.py +14 -14
  3. match_utils.py +2 -2
README.md CHANGED
@@ -13,7 +13,7 @@ pinned: true
13
  ![logo](./static/PF.png)
14
 
15
  ## Purpose:
16
- #### This is a FastAPI web application designed to allow job-seekers to learn more about various occupations and explore their future career path. See below for details and page descriptions. If you like the app, please star and/or fork and check back for future releases.
17
 
18
  ## To Access the App:
19
  https://huggingface.co/spaces/celise88/Pathfinder
@@ -67,4 +67,4 @@ And navigate to http://localhost:8000/ in your browser
67
  * The classification model underlying the skills extractor is a custom distilbert-base-uncased binary classification model that was finetuned using a balanced dataset comprised of the emsi (now Lightcast) open skills database and a random sample of the dbpedia database. The model achieved an f1 score of 0.967 on the validation sample (accuracy of 0.967, loss of 0.096). It can be accessed via Hugging Face: https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier.
68
  * Cohere's LLM is used to get the neural text embeddings. (This is why a cohere API key is needed for the new functionality to work in this release; I plan to incorporate an open-source embedding model in a future release.)
69
 
70
- * Version 1.1.2 (current version) - 1/29/2024 - Migrated from finetuned Distilbert LLM to Mistral (see https://huggingface.co/mistralai/Mistral-7B-v0.1 for more information).
 
13
  ![logo](./static/PF.png)
14
 
15
  ## Purpose:
16
+ #### This is a FastAPI web application designed to help job-seekers learn more about various occupations and explore their future career paths. See below for details and page descriptions. If you like the app, please star and/or fork it and check back for future releases. Please note that this is a work in progress and may yield unexpected job-match results: we are currently moving away from Cohere.ai's embedding model and have not yet found a replacement that achieves acceptable performance.
17
 
18
  ## To Access the App:
19
  https://huggingface.co/spaces/celise88/Pathfinder
 
67
  * The classification model underlying the skills extractor is a custom distilbert-base-uncased binary classification model that was finetuned using a balanced dataset comprised of the emsi (now Lightcast) open skills database and a random sample of the dbpedia database. The model achieved an f1 score of 0.967 on the validation sample (accuracy of 0.967, loss of 0.096). It can be accessed via Hugging Face: https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier.
68
  * Cohere's LLM is used to get the neural text embeddings. (This is why a cohere API key is needed for the new functionality to work in this release; I plan to incorporate an open-source embedding model in a future release.)
69
 
70
+ * Version 1.1.2 (current version) - 1/29/2024 - Migrated from a fine-tuned DistilBERT LLM and Cohere.ai's embedding model to Mistral (see https://huggingface.co/mistralai/Mistral-7B-v0.1 for more information).
main.py CHANGED
@@ -170,24 +170,24 @@ async def post_matches(request: Request, bt: BackgroundTasks, resume: UploadFile
170
 
171
  username = localStorage.getItem('username')
172
 
173
- def add_data_to_db(resume):
174
  db = pd.read_csv('static/res_embeddings.csv')
175
- embeds = format(skillEmbed(resume)).replace('[[','').replace(']]','').replace('[','').replace(']','').split(',')
176
  db.iloc[db['username']== username,5:] = embeds
177
  db.to_csv('static/res_embeddings.csv', index=False)
178
 
179
- def get_jobs_from_db(resume):
180
- job_matches = sim_result_loop_jobFinder(resume)
181
  print(job_matches)
182
 
183
  resume = get_resume(resume)
184
  skills = skill_extractor(resume)
185
- simResults = await sim_result_loop(resume)
186
  links = get_links(simResults[0])
187
 
188
  if username is not None:
189
- bt.add_task(add_data_to_db, resume)
190
- bt.add_task(get_jobs_from_db, resume)
191
 
192
  return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults[0], 'links': links, 'statelist': statelist})
193
 
@@ -211,24 +211,24 @@ async def post_matches(request: Request, bt: BackgroundTasks, jobdesc: UploadFil
211
 
212
  username = localStorage.getItem('username')
213
 
214
- def add_data_to_db(jobdesc):
215
  db = pd.read_csv('static/jd_embeddings.csv')
216
- embeds = format(skillEmbed(jobdesc)).replace('[[','').replace(']]','').split(',')
217
  db.iloc[db['username']== username,5:] = embeds
218
  db.to_csv('static/jd_embeddings.csv', index=False)
219
 
220
- def get_cand_from_db(jobdesc):
221
- cand_matches = sim_result_loop_candFinder(jobdesc)
222
  print(cand_matches)
223
 
224
  jobdesc = get_resume(jobdesc)
225
  skills = skill_extractor(jobdesc)
226
- simResults = await sim_result_loop(jobdesc)
227
  links = get_links(simResults[0])
228
 
229
  if username is not None:
230
- bt.add_task(add_data_to_db, jobdesc)
231
- bt.add_task(get_cand_from_db, jobdesc)
232
 
233
  return templates.TemplateResponse('candidate_matcher.html', context={'request': request, 'jobdesc': jobdesc, 'skills': skills, 'simResults': simResults[0], 'links': links})
234
 
 
170
 
171
  username = localStorage.getItem('username')
172
 
173
+ def add_data_to_db(skills):
174
  db = pd.read_csv('static/res_embeddings.csv')
175
+ embeds = format(skillEmbed(skills)).replace('[[','').replace(']]','').replace('[','').replace(']','').split(',')
176
  db.iloc[db['username']== username,5:] = embeds
177
  db.to_csv('static/res_embeddings.csv', index=False)
178
 
179
+ def get_jobs_from_db(skills):
180
+ job_matches = sim_result_loop_jobFinder(skills)
181
  print(job_matches)
182
 
183
  resume = get_resume(resume)
184
  skills = skill_extractor(resume)
185
+ simResults = await sim_result_loop(skills)
186
  links = get_links(simResults[0])
187
 
188
  if username is not None:
189
+ bt.add_task(add_data_to_db, skills)
190
+ bt.add_task(get_jobs_from_db, skills)
191
 
192
  return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults[0], 'links': links, 'statelist': statelist})
193
 
 
211
 
212
  username = localStorage.getItem('username')
213
 
214
+ def add_data_to_db(skills):
215
  db = pd.read_csv('static/jd_embeddings.csv')
216
+ embeds = format(skillEmbed(skills)).replace('[[','').replace(']]','').split(',')
217
  db.iloc[db['username']== username,5:] = embeds
218
  db.to_csv('static/jd_embeddings.csv', index=False)
219
 
220
+ def get_cand_from_db(skills):
221
+ cand_matches = sim_result_loop_candFinder(skills)
222
  print(cand_matches)
223
 
224
  jobdesc = get_resume(jobdesc)
225
  skills = skill_extractor(jobdesc)
226
+ simResults = await sim_result_loop(skills)
227
  links = get_links(simResults[0])
228
 
229
  if username is not None:
230
+ bt.add_task(add_data_to_db, skills)
231
+ bt.add_task(get_cand_from_db, skills)
232
 
233
  return templates.TemplateResponse('candidate_matcher.html', context={'request': request, 'jobdesc': jobdesc, 'skills': skills, 'simResults': simResults[0], 'links': links})
234
 
match_utils.py CHANGED
@@ -19,8 +19,8 @@ except AttributeError:
19
  else:
20
  ssl._create_default_https_context = _create_unverified_https_context
21
 
22
- # LOAD COHERE EMBEDDINGS:
23
- simdat = pd.read_csv('static/embeddings/cohere_embeddings.csv')
24
  coheredat = pd.read_csv('static/cohere_tSNE_dat.csv')
25
 
26
  # LOAD LLM MODELS:
 
19
  else:
20
  ssl._create_default_https_context = _create_unverified_https_context
21
 
22
+ # LOAD EMBEDDINGS:
23
+ simdat = pd.read_csv('static/embeddings/onet_embeddings.csv')
24
  coheredat = pd.read_csv('static/cohere_tSNE_dat.csv')
25
 
26
  # LOAD LLM MODELS: