Spaces:
Runtime error
Runtime error
File size: 6,560 Bytes
b2bedce 7420aeb b2bedce dfd490b b2bedce 7420aeb b2bedce dfd490b 7420aeb b2bedce 7420aeb b2bedce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
from fastapi import FastAPI, Request, Form, File, UploadFile
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse
import pandas as pd
import requests
from bs4 import BeautifulSoup
from cleantext import clean
from docx import Document
import os
import cohere
import string
import numpy as np
from numpy.linalg import norm
from nltk.tokenize import SpaceTokenizer
import nltk
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from dotenv import load_dotenv
# Load variables from a .env file into the environment; get_resume later reads
# COHERE_TOKEN through os.getenv.
load_dotenv()
# FastAPI application; the local 'static' directory is mounted at /static and
# HTML templates are looked up under 'templates/'.
app = FastAPI()
app.mount("/static", StaticFiles(directory='static'), name="static")
templates = Jinja2Templates(directory="templates/")
# Job-title table; the handlers below read its 'JobTitle' and 'onetCode' columns.
onet = pd.read_csv('static/ONET_JobTitles.csv')
# Job table with a 'Title' column; get_resume uses columns 1.. of each row as the
# vector it compares against the resume embedding.
# NOTE(review): presumably precomputed Cohere embeddings (per the file name) - confirm.
simdat = pd.read_csv('static/cohere_embeddings.csv')
# Text-classification pipeline built from model/tokenizer files stored under
# 'static'; get_resume maps its 'LABEL_1' output to "Skill".
model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
### job information center ###
# get
@app.get("/")
def render_job_list(request: Request):
    """Render the job information page with the list of selectable job titles."""
    page_context = {'request': request, 'joblist': onet['JobTitle']}
    return templates.TemplateResponse('job_list.html', context=page_context)
# post
@app.post("/")
def render_job_info(request: Request, jobtitle: str = Form(enum=[x for x in onet['JobTitle']])):
    """Render the job page with the description and task list scraped from O*NET OnLine.

    Args:
        request: Incoming request, forwarded to the template context.
        jobtitle: Job title submitted by the form; looked up in ``onet['JobTitle']``.

    Returns:
        The ``job_list.html`` template response. When ``jobtitle`` is empty or
        not present in ``onet``, the page is rendered with the job list only
        (the same context the GET handler uses) instead of failing.

    Raises:
        requests.exceptions.RequestException: If an O*NET request fails or
            exceeds the timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    request_timeout = 10  # seconds
    joblist = onet['JobTitle']

    matching_codes = []
    if jobtitle:
        matching_codes = onet.loc[onet['JobTitle'] == jobtitle, 'onetCode'].tolist()
    if not matching_codes:
        # Nothing to look up: fall back to the plain list page.
        return templates.TemplateResponse('job_list.html', context={'request': request, 'joblist': joblist})
    onetCode = matching_codes[0]

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}

    # Job description: text of the first <p> element on the summary page.
    url = "https://www.onetonline.org/link/summary/" + onetCode
    response = requests.get(url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    jobdescription = soup.p.get_text()

    # Task list: flatten the result page to text, then cut out the section
    # between the two marker phrases.
    url = "https://www.onetonline.org/link/result/" + onetCode + "?c=tk&n_tk=0&s_tk=IM&c_tk=0"
    response = requests.get(url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
    # NOTE(review): the lowercase markers below presumably rely on clean()
    # lowercasing the text - confirm against the cleantext defaults.
    tasks = clean(tasks)
    tasks = tasks.split('show all show top 10')[1]
    tasks = tasks.split('occupations related to multiple tasks')[0]
    tasks = ''.join(tasks.splitlines())
    tasks = tasks.replace("related occupations", " ").replace("core", " - ").replace(" )importance category task", "").replace(" find ", "")

    # One entry per sentence, with digits and dashes removed.
    strip_digits_and_dashes = str.maketrans('', '', '0123456789-')
    tasks = [task.translate(strip_digits_and_dashes) for task in tasks.split(". ")]

    return templates.TemplateResponse('job_list.html', context={
        'request': request,
        'joblist': joblist,
        'jobtitle': jobtitle,
        'jobdescription': jobdescription,
        'tasks': tasks})
### job neighborhoods ###
@app.get("/explore-job-neighborhoods/", response_class=HTMLResponse)
async def render_job_neighborhoods(request: Request):
    """Render the job neighborhoods page."""
    page_context = {'request': request}
    return templates.TemplateResponse('job_neighborhoods.html', context=page_context)
### find my match ###
# get
@app.get("/find-my-match", response_class=HTMLResponse)
async def match_page(request: Request):
    """Render the resume upload page, before any resume has been submitted."""
    page_context = {'request': request}
    return templates.TemplateResponse('find_my_match.html', context=page_context)
# Resume filler words removed, together with NLTK's English stop words, before
# tokens are sent to the skill classifier.
_RESUME_STOPWORDS = frozenset({
    'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
    'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
    'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
    'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams',
})


def _clean_resume_text(text):
    """Join the text onto one line, split on '-' and '/', strip punctuation, then run ``clean``."""
    single_line = ' '.join(text.splitlines())
    single_line = single_line.replace('-', " ").replace("/", " ")
    return clean(single_line.translate(str.maketrans('', '', string.punctuation)))


def _embed_resume(text):
    """Return Cohere's embeddings for ``text`` (a list holding one vector, as one text is sent)."""
    co = cohere.Client(os.getenv("COHERE_TOKEN"))
    response = co.embed(
        model='large',
        texts=[text])
    return response.embeddings


def _rank_job_matches(embedding):
    """Rank the jobs in ``simdat`` by cosine similarity to ``embedding``.

    Returns a DataFrame with columns 'JobTitle' and 'Similarity' (formatted to
    two decimals as strings) holding the 2nd through 13th best matches.
    """
    resume_vec = np.asarray(embedding, dtype=float).ravel()
    # Columns 1.. of simdat hold each job's vector; score every row in one pass.
    job_matrix = simdat.iloc[:, 1:].to_numpy(dtype=float)
    similarity = job_matrix @ resume_vec / (norm(job_matrix, axis=1) * norm(resume_vec))

    ranked = pd.DataFrame({'JobTitle': simdat['Title'].to_numpy(), 'Similarity': similarity})
    ranked = ranked.sort_values(by="Similarity", ascending=False)
    # NOTE(review): the single best match is skipped, as in the original
    # (iloc[:13] followed by iloc[1:]) - confirm this is intentional.
    ranked = ranked.iloc[1:13, :].reset_index(drop=True)
    # Format the whole column at once; writing strings cell by cell into a
    # float column is deprecated in pandas.
    ranked['Similarity'] = ranked['Similarity'].map("{:0.2f}".format)
    return ranked


def _extract_skills(resume_text):
    """Map each remaining resume token to "Skill" or "Not Skill" using ``classifier``."""
    cleaned = _clean_resume_text(resume_text)
    stops = set(nltk.corpus.stopwords.words('english')) | _RESUME_STOPWORDS
    tokens = [
        word for word in SpaceTokenizer().tokenize(cleaned)
        if word not in stops and "(" not in word and ")" not in word
    ]
    labels = {}
    for token in tokens:
        if token in labels:
            # Already classified; repeated words need no second model call.
            continue
        is_skill = classifier(token)[0]['label'] == 'LABEL_1'
        labels[token] = "Skill" if is_skill else "Not Skill"
    return labels


# post
@app.post('/find-my-match', response_class=HTMLResponse)
def get_resume(request: Request, resume: UploadFile = File(...)):
    """Match an uploaded .docx resume against the job table and tag its skills.

    Args:
        request: Incoming request, forwarded to the template context.
        resume: Uploaded resume file; parsed with python-docx.

    Returns:
        The ``find_my_match.html`` template response with the resume text
        ('resume'), the token-to-label mapping ('skills') and the ranked job
        matches ('simResults').
    """
    import io

    # Parse the upload in memory. Writing it to static/<client filename> let
    # the client choose a path inside the publicly served directory and left
    # the file behind afterwards.
    document = Document(io.BytesIO(resume.file.read()))
    resume_text = "\n".join(para.text for para in document.paragraphs)

    simResults = _rank_job_matches(_embed_resume(resume_text))
    skills = _extract_skills(resume_text)
    return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume_text, 'skills': skills, 'simResults': simResults})
|