celise88 committed
Commit 6d4e9bd · 1 Parent(s): 870c9da

reorganize flow
Files changed (3)
  1. README.md +3 -0
  2. main.py +16 -50
  3. utils.py +34 -0
README.md CHANGED
@@ -4,6 +4,9 @@ emoji: 🗺️
 colorFrom: blue
 colorTo: green
 sdk: docker
+python_version: 3.10.9
+app_port: 7860
+models: celise88/distilbert-base-uncased-finetuned-binary-classifier
 pinned: true
 ---
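The new models line links the Space to the fine-tuned classifier on the Hub, while python_version and app_port pin the Docker runtime. As a quick check that the declared model repo actually resolves, a minimal sketch (assuming the repo is public and transformers is installed; the sample input is illustrative only):

# Load the Hub model declared in the README metadata and classify one token.
from transformers import pipeline

classifier = pipeline(
    'text-classification',
    model='celise88/distilbert-base-uncased-finetuned-binary-classifier',
)
print(classifier('python')[0])  # e.g. {'label': 'LABEL_1', 'score': ...}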
 
main.py CHANGED
@@ -7,16 +7,10 @@ import requests
 from bs4 import BeautifulSoup
 from cleantext import clean
 from docx import Document
-import os
-import cohere
-import string
 import numpy as np
-from numpy.linalg import norm
-from nltk.tokenize import SpaceTokenizer
-import nltk
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
-from dotenv import load_dotenv
-load_dotenv()
+import utils
+from utils import coSkillEmbed, cosine, clean_my_text
 
 app = FastAPI()
 app.mount("/static", StaticFiles(directory='static'), name="static")
@@ -27,7 +21,6 @@ simdat = pd.read_csv('static/cohere_embeddings.csv')
 
 model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
 tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
-classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
 
 ### job information center ###
 # get
@@ -85,6 +78,9 @@ async def match_page(request: Request):
 # post
 @app.post('/find-my-match', response_class=HTMLResponse)
 def get_resume(request: Request, resume: UploadFile = File(...)):
+
+    classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
+
     path = f"static/{resume.filename}"
     with open(path, 'wb') as buffer:
         buffer.write(resume.file.read())
@@ -94,22 +90,6 @@ def get_resume(request: Request, resume: UploadFile = File(...)):
         text.append(para.text)
     resume = "\n".join(text)
 
-    def clean_my_text(text):
-        clean_text = ' '.join(text.splitlines())
-        clean_text = clean_text.replace('-', " ").replace("/"," ")
-        clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
-        return clean_text
-
-    def coSkillEmbed(text):
-        co = cohere.Client(os.getenv("COHERE_TOKEN"))
-        response = co.embed(
-            model='large',
-            texts=[text])
-        return response.embeddings
-
-    def cosine(A, B):
-        return np.dot(A,B)/(norm(A)*norm(B))
-
     embeds = coSkillEmbed(resume)
     simResults = []
 
@@ -126,29 +106,15 @@ def get_resume(request: Request, resume: UploadFile = File(...)):
     simResults.reset_index(drop=True, inplace=True)
     for x in range(len(simResults)):
         simResults.iloc[x,1] = "{:0.2f}".format(simResults.iloc[x,1])
-
-    # EXTRACT SKILLS FROM RESUME
-    def skillNER(resume):
-        resume = clean_my_text(resume)
-        stops = set(nltk.corpus.stopwords.words('english'))
-        stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
-            'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
-            'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
-            'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
-        resume = [word for word in SpaceTokenizer().tokenize(resume) if word not in stops]
-        resume = [word for word in resume if ")" not in word]
-        resume = [word for word in resume if "(" not in word]
-
-        labels = []
-        for i in range(len(resume)):
-            classification = classifier(resume[i])[0]['label']
-            if classification == 'LABEL_1':
-                labels.append("Skill")
-            else:
-                labels.append("Not Skill")
-        labels_dict = dict(zip(resume, labels))
-        return labels_dict
 
-    skills=skillNER(resume)
-
-    return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
+    cleantext = clean_my_text(resume)
+    labels = []
+    for i in range(len(cleantext)):
+        classification = classifier(cleantext[i])[0]['label']
+        if classification == 'LABEL_1':
+            labels.append("Skill")
+        else:
+            labels.append("Not Skill")
+    skills = dict(zip(cleantext, labels))
+
+    return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
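With the helpers moved to utils.py, get_resume now reads top to bottom: save the upload, extract text from the .docx, embed and rank against the precomputed job embeddings, then token-classify for skills. A minimal smoke test for the reorganized endpoint (a sketch, assuming the app is serving locally on the app_port 7860 declared in README.md; sample_resume.docx is a hypothetical file):

# POST a .docx resume to /find-my-match and inspect the rendered response.
import requests

with open('sample_resume.docx', 'rb') as f:
    resp = requests.post(
        'http://localhost:7860/find-my-match',
        files={'resume': ('sample_resume.docx', f)},  # field name matches the UploadFile parameter
    )
print(resp.status_code)  # expect 200 once find_my_match.html renders
print(resp.text[:300])   # start of the rendered HTML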
utils.py ADDED
@@ -0,0 +1,34 @@
+from cleantext import clean
+import cohere
+import string
+import numpy as np
+from numpy.linalg import norm
+from nltk.tokenize import SpaceTokenizer
+import nltk
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+def coSkillEmbed(text):
+    co = cohere.Client(os.getenv("COHERE_TOKEN"))
+    response = co.embed(
+        model='large',
+        texts=[text])
+    return response.embeddings
+
+def cosine(A, B):
+    return np.dot(A,B)/(norm(A)*norm(B))
+
+def clean_my_text(resume):
+    clean_text = ' '.join(resume.splitlines())
+    clean_text = clean_text.replace('-', " ").replace("/"," ")
+    clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
+    stops = set(nltk.corpus.stopwords.words('english'))
+    stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
+        'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
+        'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
+        'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
+    resume = [word for word in SpaceTokenizer().tokenize(clean_text) if word not in stops]
+    resume = [word for word in resume if ")" not in word]
+    resume = [word for word in resume if "(" not in word]
+    return resume
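Together these helpers cover the whole matching flow: coSkillEmbed fetches a Cohere embedding, cosine scores it against the precomputed job embeddings, and clean_my_text reduces a resume to stopword-filtered tokens for the classifier. A minimal sketch of how they compose (assuming COHERE_TOKEN is set in the environment; the two sample strings are illustrative only):

# Embed two texts, score their similarity, and tokenize one for skill extraction.
import numpy as np
from utils import coSkillEmbed, cosine, clean_my_text

resume_text = 'Built data pipelines in Python and managed client reporting'
job_text = 'Python developer responsible for large-scale data pipelines'

resume_vec = np.array(coSkillEmbed(resume_text)[0])  # co.embed returns one embedding per input text
job_vec = np.array(coSkillEmbed(job_text)[0])
print('{:0.2f}'.format(cosine(resume_vec, job_vec)))  # formatted like the scores in main.py

print(clean_my_text(resume_text))  # tokens ready for the text-classification pipeline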