Spaces:

celise88
/

Pathfinder

Runtime error

App Files Files Community

celise88 committed on Jan 29, 2024

Commit

e121dec

1 Parent(s): c8742c4

improving embeddings

Browse files

Files changed (3) hide show

README.md +2 -2
main.py +14 -14
match_utils.py +2 -2

README.md CHANGED Viewed

@@ -13,7 +13,7 @@ pinned: true
 ![logo](./static/PF.png)
 ## Purpose:
-#### This is a FastAPI web application designed to allow job-seekers to learn more about various occupations and explore their future career path. See below for details and page descriptions. If you like the app, please star and/or fork and check back for future releases.
 ## To Access the App:
 https://huggingface.co/spaces/celise88/Pathfinder
@@ -67,4 +67,4 @@ And navigate to http://localhost:8000/ in your browser
     * The classification model underlying the skills extractor is a custom distilbert-base-uncased binary classification model that was fine-tuned using a balanced dataset composed of the Emsi (now Lightcast) open skills database and a random sample of the DBpedia database. The model achieved an F1 score of 0.967 on the validation sample (accuracy of 0.967, loss of 0.096). It can be accessed via Hugging Face: https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier.
     * Cohere's LLM is used to get the neural text embeddings. (This is why a Cohere API key is needed for the new functionality to work in this release; I plan to incorporate an open-source embedding model in a future release.)
-* Version 1.1.2 (current version) - 1/29/2024 - Migrated from finetuned Distilbert LLM to Mistral (see https://huggingface.co/mistralai/Mistral-7B-v0.1 for more information).

 ![logo](./static/PF.png)
 ## Purpose:
+#### This is a FastAPI web application designed to allow job-seekers to learn more about various occupations and explore their future career path. See below for details and page descriptions. If you like the app, please star and/or fork and check back for future releases. Please note that this is a work in progress and may yield unexpected job-match results: we are currently moving away from using Cohere.ai's embedding model and have not yet found a replacement that achieves acceptable performance.
 ## To Access the App:
 https://huggingface.co/spaces/celise88/Pathfinder
     * The classification model underlying the skills extractor is a custom distilbert-base-uncased binary classification model that was fine-tuned using a balanced dataset composed of the Emsi (now Lightcast) open skills database and a random sample of the DBpedia database. The model achieved an F1 score of 0.967 on the validation sample (accuracy of 0.967, loss of 0.096). It can be accessed via Hugging Face: https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier.
     * Cohere's LLM is used to get the neural text embeddings. (This is why a Cohere API key is needed for the new functionality to work in this release; I plan to incorporate an open-source embedding model in a future release.)
+* Version 1.1.2 (current version) - 1/29/2024 - Migrated from a fine-tuned DistilBERT LLM and Cohere.ai's embedding model to Mistral (see https://huggingface.co/mistralai/Mistral-7B-v0.1 for more information).

main.py CHANGED Viewed

@@ -170,24 +170,24 @@ async def post_matches(request: Request, bt: BackgroundTasks, resume: UploadFile
     username = localStorage.getItem('username')
-    def add_data_to_db(resume):
         db = pd.read_csv('static/res_embeddings.csv')
-        embeds = format(skillEmbed(resume)).replace('[[','').replace(']]','').replace('[','').replace(']','').split(',')
         db.iloc[db['username']== username,5:] = embeds
         db.to_csv('static/res_embeddings.csv', index=False)
-    def get_jobs_from_db(resume):
-        job_matches = sim_result_loop_jobFinder(resume)
         print(job_matches)
     resume = get_resume(resume)
     skills = skill_extractor(resume)
-    simResults = await sim_result_loop(resume)
     links = get_links(simResults[0])
     if username is not None:
-        bt.add_task(add_data_to_db, resume)
-        bt.add_task(get_jobs_from_db, resume)
     return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults[0], 'links': links, 'statelist': statelist})
@@ -211,24 +211,24 @@ async def post_matches(request: Request, bt: BackgroundTasks, jobdesc: UploadFil
     username = localStorage.getItem('username')
-    def add_data_to_db(jobdesc):
         db = pd.read_csv('static/jd_embeddings.csv')
-        embeds = format(skillEmbed(jobdesc)).replace('[[','').replace(']]','').split(',')
         db.iloc[db['username']== username,5:] = embeds
         db.to_csv('static/jd_embeddings.csv', index=False)
-    def get_cand_from_db(jobdesc):
-        cand_matches = sim_result_loop_candFinder(jobdesc)
         print(cand_matches)
     jobdesc = get_resume(jobdesc)
     skills = skill_extractor(jobdesc)
-    simResults = await sim_result_loop(jobdesc)
     links = get_links(simResults[0])
     if username is not None:
-        bt.add_task(add_data_to_db, jobdesc)
-        bt.add_task(get_cand_from_db, jobdesc)
     return templates.TemplateResponse('candidate_matcher.html', context={'request': request, 'jobdesc': jobdesc, 'skills': skills, 'simResults': simResults[0], 'links': links})

     username = localStorage.getItem('username')
+    def add_data_to_db(skills):
         db = pd.read_csv('static/res_embeddings.csv')
+        embeds = format(skillEmbed(skills)).replace('[[','').replace(']]','').replace('[','').replace(']','').split(',')
         db.iloc[db['username']== username,5:] = embeds
         db.to_csv('static/res_embeddings.csv', index=False)
+    def get_jobs_from_db(skills):
+        job_matches = sim_result_loop_jobFinder(skills)
         print(job_matches)
     resume = get_resume(resume)
     skills = skill_extractor(resume)
+    simResults = await sim_result_loop(skills)
     links = get_links(simResults[0])
     if username is not None:
+        bt.add_task(add_data_to_db, skills)
+        bt.add_task(get_jobs_from_db, skills)
     return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults[0], 'links': links, 'statelist': statelist})
     username = localStorage.getItem('username')
+    def add_data_to_db(skills):
         db = pd.read_csv('static/jd_embeddings.csv')
+        embeds = format(skillEmbed(skills)).replace('[[','').replace(']]','').split(',')
         db.iloc[db['username']== username,5:] = embeds
         db.to_csv('static/jd_embeddings.csv', index=False)
+    def get_cand_from_db(skills):
+        cand_matches = sim_result_loop_candFinder(skills)
         print(cand_matches)
     jobdesc = get_resume(jobdesc)
     skills = skill_extractor(jobdesc)
+    simResults = await sim_result_loop(skills)
     links = get_links(simResults[0])
     if username is not None:
+        bt.add_task(add_data_to_db, skills)
+        bt.add_task(get_cand_from_db, skills)
     return templates.TemplateResponse('candidate_matcher.html', context={'request': request, 'jobdesc': jobdesc, 'skills': skills, 'simResults': simResults[0], 'links': links})

match_utils.py CHANGED Viewed

@@ -19,8 +19,8 @@ except AttributeError:
 else:
     ssl._create_default_https_context = _create_unverified_https_context
-# LOAD COHERE EMBEDDINGS:
-simdat = pd.read_csv('static/embeddings/cohere_embeddings.csv')
 coheredat = pd.read_csv('static/cohere_tSNE_dat.csv')
 # LOAD LLM MODELS:

 else:
     ssl._create_default_https_context = _create_unverified_https_context
+# LOAD EMBEDDINGS:
+simdat = pd.read_csv('static/embeddings/onet_embeddings.csv')
 coheredat = pd.read_csv('static/cohere_tSNE_dat.csv')
 # LOAD LLM MODELS: