File size: 6,560 Bytes
b2bedce
7420aeb
 
 
 
 
 
 
b2bedce
 
 
 
 
 
 
 
dfd490b
b2bedce
 
7420aeb
 
 
 
 
 
b2bedce
 
dfd490b
 
 
7420aeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2bedce
 
 
7420aeb
 
b2bedce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import io
import os
import string

import cohere
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from cleantext import clean
from docx import Document
from dotenv import load_dotenv
from fastapi import FastAPI, Request, Form, File, UploadFile
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from nltk.tokenize import SpaceTokenizer
from numpy.linalg import norm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Load variables from a .env file into the process environment; COHERE_TOKEN
# is read later through os.getenv when the resume is embedded.
load_dotenv()

# Application object, static asset mount and Jinja2 template directory.
app = FastAPI()
app.mount("/static", StaticFiles(directory='static'), name="static")
templates = Jinja2Templates(directory="templates/")

# Reference data, loaded once at import time.
# onet: the handlers read its 'JobTitle' and 'onetCode' columns.
onet = pd.read_csv('static/ONET_JobTitles.csv')
# simdat: the resume matcher reads its 'Title' column and treats every column
# after the first as one component of a job's embedding vector.
simdat = pd.read_csv('static/cohere_embeddings.csv')

# Sequence-classification model and tokenizer loaded from local directories
# under static/, wrapped in a text-classification pipeline. The resume handler
# calls `classifier` one token at a time and treats 'LABEL_1' as "Skill".
model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)

### job information center ###
# get
@app.get("/")
def render_job_list(request: Request):
    """Render the job list page with every job title offered for selection."""
    page_context = {
        'request': request,
        'joblist': onet['JobTitle'],
    }
    return templates.TemplateResponse('job_list.html', context=page_context)

# post
@app.post("/")
def render_job_info(request: Request, jobtitle: str = Form(enum=[x for x in onet['JobTitle']])):
    """Render the job list page with the description and tasks of one job.

    The O*NET code for ``jobtitle`` is looked up in ``onet``; the job summary
    and task list are then scraped from onetonline.org.

    Args:
        request: Incoming request, forwarded to the template.
        jobtitle: Job title submitted by the form.

    Returns:
        The rendered ``job_list.html`` template. When ``jobtitle`` is empty or
        not present in ``onet`` the page is rendered with the job list only,
        exactly as the GET handler does.

    Raises:
        requests.exceptions.RequestException: If onetonline.org cannot be
            reached within the timeout.
    """
    joblist = onet['JobTitle']
    context = {'request': request, 'joblist': joblist}

    # Without a usable title there is nothing to look up; fall back to the
    # plain list page instead of returning None or raising IndexError.
    if not jobtitle:
        return templates.TemplateResponse('job_list.html', context=context)
    matching_codes = onet.loc[onet['JobTitle'] == jobtitle, 'onetCode']
    if matching_codes.empty:
        return templates.TemplateResponse('job_list.html', context=context)
    onetCode = matching_codes.tolist()[0]

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    # requests never times out by default; bound the wait so a stalled remote
    # server cannot hang the worker indefinitely.
    timeout_seconds = 10

    # --- job description: text of the first <p> on the summary page ---
    url = "https://www.onetonline.org/link/summary/" + onetCode
    response = requests.get(url, headers=headers, timeout=timeout_seconds)
    soup = BeautifulSoup(response.text, 'html.parser')
    first_paragraph = soup.p
    jobdescription = first_paragraph.get_text() if first_paragraph is not None else ""

    # --- tasks: flatten the task report page and cut out the task section ---
    url = "https://www.onetonline.org/link/result/" + onetCode + "?c=tk&n_tk=0&s_tk=IM&c_tk=0"
    response = requests.get(url, headers=headers, timeout=timeout_seconds)
    soup = BeautifulSoup(response.text, 'html.parser')
    page_text = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
    page_text = clean(page_text)

    tasks = []
    sections = page_text.split('show all show top 10')
    # The task section sits between the first and second occurrence of the
    # marker; if the page layout changed and the marker is gone, show no tasks
    # rather than failing the whole request.
    if len(sections) > 1:
        task_text = sections[1].split('occupations related to multiple tasks')[0]
        task_text = ''.join(task_text.splitlines())
        task_text = (
            task_text.replace("related occupations", " ")
            .replace("core", " - ")
            .replace(" )importance category task", "")
            .replace(" find ", "")
        )
        # Drop the importance scores and separators left over from the table.
        strip_digits_and_hyphens = str.maketrans('', '', '0123456789-')
        tasks = [task.translate(strip_digits_and_hyphens) for task in task_text.split(". ")]

    context.update({
        'jobtitle': jobtitle,
        'jobdescription': jobdescription,
        'tasks': tasks,
    })
    return templates.TemplateResponse('job_list.html', context=context)

### job neighborhoods ###
@app.get("/explore-job-neighborhoods/", response_class=HTMLResponse)
async def render_job_neighborhoods(request: Request):
    """Render the job neighborhoods page."""
    page_context = {'request': request}
    return templates.TemplateResponse('job_neighborhoods.html', context=page_context)

### find my match ###
# get
@app.get("/find-my-match", response_class=HTMLResponse)
async def match_page(request: Request):
    """Render the empty "find my match" page containing the upload form."""
    page_context = {'request': request}
    return templates.TemplateResponse('find_my_match.html', context=page_context)

# post
@app.post('/find-my-match', response_class=HTMLResponse)
def get_resume(request: Request, resume: UploadFile = File(...)):
    """Match an uploaded .docx resume against job embeddings and tag its skills.

    The resume text is embedded with Cohere and compared (cosine similarity)
    with every row of ``simdat``; each remaining resume token is then run
    through ``classifier`` and labelled "Skill" or "Not Skill".

    Args:
        request: Incoming request, forwarded to the template.
        resume: Uploaded resume; it is parsed with python-docx.

    Returns:
        The rendered ``find_my_match.html`` template with the resume text, a
        ``{token: label}`` dict and a DataFrame of job titles with their
        similarity formatted to two decimals.
    """
    # Parse the upload in memory. The client-supplied filename is never used
    # as a path: writing it under static/ would let a crafted name overwrite
    # served files or escape the directory, and would leave every resume
    # publicly downloadable.
    document = Document(io.BytesIO(resume.file.read()))
    resume_text = "\n".join(para.text for para in document.paragraphs)

    def clean_my_text(text):
        """Flatten to one line, split on '-' and '/', strip punctuation, normalise."""
        clean_text = ' '.join(text.splitlines())
        clean_text = clean_text.replace('-', " ").replace("/", " ")
        clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
        return clean_text

    def coSkillEmbed(text):
        """Return the Cohere embeddings for a single text."""
        co = cohere.Client(os.getenv("COHERE_TOKEN"))
        response = co.embed(
            model='large',
            texts=[text])
        return response.embeddings

    # --- job similarity ---
    # One text was sent, so flatten the response to a single 1-D query vector.
    query = np.asarray(coSkillEmbed(resume_text), dtype=float).ravel()
    # Every column after the first holds one embedding component per job.
    job_vectors = simdat.iloc[:, 1:].to_numpy(dtype=float)
    # Cosine similarity of the query against all jobs in one vectorised step.
    similarity = job_vectors @ query / (norm(job_vectors, axis=1) * norm(query))

    simResults = pd.DataFrame({'JobTitle': simdat['Title'], 'Similarity': similarity})
    simResults = simResults.sort_values(by="Similarity", ascending=False)
    # Keep ranks 2-13.
    # NOTE(review): the single best match is skipped, as in the original
    # implementation -- confirm this is intended for resume matching.
    simResults = simResults.iloc[1:13, :].reset_index(drop=True)
    # Format the whole column at once instead of writing strings cell by cell
    # into a float column.
    simResults['Similarity'] = simResults['Similarity'].map("{:0.2f}".format)

    # EXTRACT SKILLS FROM RESUME
    def skillNER(resume):
        """Return ``{token: "Skill" | "Not Skill"}`` for the resume's tokens."""
        cleaned = clean_my_text(resume)
        stops = set(nltk.corpus.stopwords.words('english'))
        stops = stops.union({
            'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
            'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
            'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
            'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams',
        })
        tokens = [
            word for word in SpaceTokenizer().tokenize(cleaned)
            if word not in stops and ")" not in word and "(" not in word
        ]

        # Classify each distinct token once; dict.fromkeys keeps first-seen
        # order, and an empty resume simply yields an empty dict.
        labels_dict = {}
        for token in dict.fromkeys(tokens):
            classification = classifier(token)[0]['label']
            labels_dict[token] = "Skill" if classification == 'LABEL_1' else "Not Skill"
        return labels_dict

    skills = skillNER(resume_text)

    return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume_text, 'skills': skills, 'simResults': simResults})