Spaces:
Runtime error
Runtime error
Add additional O*NET scraping functionality
Browse files
- main.py +19 -2
- requirements.txt +2 -1
- scrape_onet.py +54 -17
main.py
CHANGED
@@ -18,7 +18,7 @@ from mangum import Mangum
|
|
18 |
from localStoragePy import localStoragePy
|
19 |
localStorage = localStoragePy('pathfinder', 'text')
|
20 |
|
21 |
-
from scrape_onet import get_onet_code, get_onet_description, get_onet_tasks, get_job_postings
|
22 |
from match_utils import neighborhoods, get_resume, skillNER, sim_result_loop, get_links, coSkillEmbed, sim_result_loop_jobFinder, sim_result_loop_candFinder
|
23 |
from user_utils import Hash
|
24 |
|
@@ -122,13 +122,30 @@ def post_job(request: Request, bt: BackgroundTasks, jobtitle: str = Form(enum=[x
|
|
122 |
onetCode = get_onet_code(jobtitle)
|
123 |
jobdescription = get_onet_description(onetCode)
|
124 |
tasks = get_onet_tasks(onetCode)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
bt.add_task(neighborhoods, jobtitle)
|
126 |
return templates.TemplateResponse('job_list.html', context={
|
127 |
'request': request,
|
128 |
'joblist': joblist,
|
129 |
'jobtitle': jobtitle,
|
130 |
'jobdescription': jobdescription,
|
131 |
-
'tasks': tasks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
### JOB NEIGHBORHOODS ###
|
134 |
@app.get("/explore-job-neighborhoods/", response_class=HTMLResponse)
|
|
|
18 |
from localStoragePy import localStoragePy
|
19 |
localStorage = localStoragePy('pathfinder', 'text')
|
20 |
|
21 |
+
from scrape_onet import get_onet_code, get_onet_description, get_onet_tasks, get_onet_activities, get_onet_context, get_onet_skills, get_onet_knowledge, get_onet_abilities, get_onet_interests, get_onet_styles, get_onet_values, get_job_postings
|
22 |
from match_utils import neighborhoods, get_resume, skillNER, sim_result_loop, get_links, coSkillEmbed, sim_result_loop_jobFinder, sim_result_loop_candFinder
|
23 |
from user_utils import Hash
|
24 |
|
|
|
122 |
onetCode = get_onet_code(jobtitle)
|
123 |
jobdescription = get_onet_description(onetCode)
|
124 |
tasks = get_onet_tasks(onetCode)
|
125 |
+
activities = get_onet_activities(onetCode)
|
126 |
+
context = get_onet_context(onetCode)
|
127 |
+
skills = get_onet_skills(onetCode)
|
128 |
+
knowledge = get_onet_knowledge(onetCode)
|
129 |
+
abilities = get_onet_abilities(onetCode)
|
130 |
+
interests = get_onet_interests(onetCode)
|
131 |
+
values = get_onet_values(onetCode)
|
132 |
+
styles = get_onet_styles(onetCode)
|
133 |
+
|
134 |
bt.add_task(neighborhoods, jobtitle)
|
135 |
return templates.TemplateResponse('job_list.html', context={
|
136 |
'request': request,
|
137 |
'joblist': joblist,
|
138 |
'jobtitle': jobtitle,
|
139 |
'jobdescription': jobdescription,
|
140 |
+
'tasks': tasks,
|
141 |
+
'activities': activities,
|
142 |
+
'context': context,
|
143 |
+
'knowledge': knowledge,
|
144 |
+
'abilities': abilities,
|
145 |
+
'skills': skills,
|
146 |
+
'interests': interests,
|
147 |
+
'values': values,
|
148 |
+
'styles': styles})
|
149 |
|
150 |
### JOB NEIGHBORHOODS ###
|
151 |
@app.get("/explore-job-neighborhoods/", response_class=HTMLResponse)
|
requirements.txt
CHANGED
@@ -21,4 +21,5 @@ bcrypt==4.0.1
|
|
21 |
passlib==1.7.4
|
22 |
localStoragePy==0.2.3
|
23 |
sentence-transformers==2.2.2
|
24 |
-
mangum==0.17.0
|
|
|
|
21 |
passlib==1.7.4
|
22 |
localStoragePy==0.2.3
|
23 |
sentence-transformers==2.2.2
|
24 |
+
mangum==0.17.0
|
25 |
+
certifi==2023.7.22
|
scrape_onet.py
CHANGED
@@ -36,12 +36,11 @@ def get_onet_tasks(onetCode):
|
|
36 |
tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
|
37 |
return tasks
|
38 |
|
39 |
-
def
|
40 |
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
41 |
|
42 |
activities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wa&n_wa=0&s_wa=IM&c_wa=0"
|
43 |
-
|
44 |
-
|
45 |
response = requests.get(activities_url, headers=headers)
|
46 |
soup = BeautifulSoup(response.text, 'html.parser')
|
47 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
@@ -57,7 +56,14 @@ def get_onet_ratings(onetCode):
|
|
57 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(' ) ', '')])
|
58 |
df = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
|
59 |
df = df[df['Importance'] != '']
|
|
|
|
|
60 |
|
|
|
|
|
|
|
|
|
|
|
61 |
response = requests.get(context_url, headers=headers)
|
62 |
soup = BeautifulSoup(response.text, 'html.parser')
|
63 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
@@ -73,16 +79,14 @@ def get_onet_ratings(onetCode):
|
|
73 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
74 |
df2 = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
|
75 |
df2 = df2[df2['Importance'] != '']
|
76 |
-
|
77 |
-
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
abilities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ab&n_ab=0&s_ab=IM&c_ab=0"
|
82 |
-
interests_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=in&c_in=0"
|
83 |
-
values_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wv&c_wv=0"
|
84 |
-
style_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ws&n_ws=0&c_ws=0"
|
85 |
|
|
|
|
|
86 |
response = requests.get(skills_url, headers=headers)
|
87 |
soup = BeautifulSoup(response.text, 'html.parser')
|
88 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
@@ -98,6 +102,13 @@ def get_onet_ratings(onetCode):
|
|
98 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
99 |
df3 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
100 |
df3 = df3[df3['Importance'] != '']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
response = requests.get(knowledge_url, headers=headers)
|
103 |
soup = BeautifulSoup(response.text, 'html.parser')
|
@@ -114,7 +125,14 @@ def get_onet_ratings(onetCode):
|
|
114 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
115 |
df4 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
116 |
df4 = df4[df4['Importance'] != '']
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
response = requests.get(abilities_url, headers=headers)
|
119 |
soup = BeautifulSoup(response.text, 'html.parser')
|
120 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
@@ -130,7 +148,14 @@ def get_onet_ratings(onetCode):
|
|
130 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
131 |
df5 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
132 |
df5 = df5[df5['Importance'] != '']
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
response = requests.get(interests_url, headers=headers)
|
135 |
soup = BeautifulSoup(response.text, 'html.parser')
|
136 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
@@ -146,7 +171,14 @@ def get_onet_ratings(onetCode):
|
|
146 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
147 |
df6 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
148 |
df6 = df6[df6['Importance'] != '']
|
|
|
|
|
149 |
|
|
|
|
|
|
|
|
|
|
|
150 |
response = requests.get(values_url, headers=headers)
|
151 |
soup = BeautifulSoup(response.text, 'html.parser')
|
152 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
@@ -162,6 +194,13 @@ def get_onet_ratings(onetCode):
|
|
162 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
163 |
df7 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
164 |
df7 = df7[df7['Importance'] != '']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
166 |
response = requests.get(style_url, headers=headers)
|
167 |
soup = BeautifulSoup(response.text, 'html.parser')
|
@@ -178,10 +217,8 @@ def get_onet_ratings(onetCode):
|
|
178 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
179 |
df8 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
180 |
df8 = df8[df8['Importance'] != '']
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
return [job_df, cand_df]
|
185 |
|
186 |
def get_job_postings(onetCode, state):
|
187 |
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
|
|
36 |
tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
|
37 |
return tasks
|
38 |
|
39 |
+
def get_onet_activities(onetCode):
|
40 |
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
41 |
|
42 |
activities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wa&n_wa=0&s_wa=IM&c_wa=0"
|
43 |
+
|
|
|
44 |
response = requests.get(activities_url, headers=headers)
|
45 |
soup = BeautifulSoup(response.text, 'html.parser')
|
46 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
|
|
56 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(' ) ', '')])
|
57 |
df = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
|
58 |
df = df[df['Importance'] != '']
|
59 |
+
activities = df
|
60 |
+
return activities
|
61 |
|
62 |
+
def get_onet_context(onetCode):
|
63 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
64 |
+
|
65 |
+
context_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=cx&n_cx=0&c_cx=0&s_cx=n"
|
66 |
+
|
67 |
response = requests.get(context_url, headers=headers)
|
68 |
soup = BeautifulSoup(response.text, 'html.parser')
|
69 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
|
|
79 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
80 |
df2 = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
|
81 |
df2 = df2[df2['Importance'] != '']
|
82 |
+
context = df2
|
83 |
+
return context
|
84 |
|
85 |
+
def get_onet_skills(onetCode):
|
86 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
|
|
|
|
|
|
|
|
87 |
|
88 |
+
skills_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=sk&n_sk=0&s_sk=IM&c_sk=0"
|
89 |
+
|
90 |
response = requests.get(skills_url, headers=headers)
|
91 |
soup = BeautifulSoup(response.text, 'html.parser')
|
92 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
|
|
102 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
103 |
df3 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
104 |
df3 = df3[df3['Importance'] != '']
|
105 |
+
skills = df3
|
106 |
+
return skills
|
107 |
+
|
108 |
+
def get_onet_knowledge(onetCode):
|
109 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
110 |
+
|
111 |
+
knowledge_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=kn&n_kn=0&s_kn=IM&c_kn=0"
|
112 |
|
113 |
response = requests.get(knowledge_url, headers=headers)
|
114 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
125 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
126 |
df4 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
127 |
df4 = df4[df4['Importance'] != '']
|
128 |
+
knowledge = df4
|
129 |
+
return knowledge
|
130 |
+
|
131 |
+
def get_onet_abilities(onetCode):
|
132 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
133 |
+
|
134 |
+
abilities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ab&n_ab=0&s_ab=IM&c_ab=0"
|
135 |
+
|
136 |
response = requests.get(abilities_url, headers=headers)
|
137 |
soup = BeautifulSoup(response.text, 'html.parser')
|
138 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
|
|
148 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
149 |
df5 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
150 |
df5 = df5[df5['Importance'] != '']
|
151 |
+
abilities = df5
|
152 |
+
return abilities
|
153 |
+
|
154 |
+
def get_onet_interests(onetCode):
|
155 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
156 |
+
|
157 |
+
interests_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=in&c_in=0"
|
158 |
+
|
159 |
response = requests.get(interests_url, headers=headers)
|
160 |
soup = BeautifulSoup(response.text, 'html.parser')
|
161 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
|
|
171 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
172 |
df6 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
173 |
df6 = df6[df6['Importance'] != '']
|
174 |
+
interests = df6
|
175 |
+
return interests
|
176 |
|
177 |
+
def get_onet_values(onetCode):
|
178 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
179 |
+
|
180 |
+
values_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wv&c_wv=0"
|
181 |
+
|
182 |
response = requests.get(values_url, headers=headers)
|
183 |
soup = BeautifulSoup(response.text, 'html.parser')
|
184 |
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
|
|
|
194 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
195 |
df7 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
196 |
df7 = df7[df7['Importance'] != '']
|
197 |
+
values = df7
|
198 |
+
return values
|
199 |
+
|
200 |
+
def get_onet_styles(onetCode):
|
201 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
202 |
+
|
203 |
+
style_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ws&n_ws=0&c_ws=0"
|
204 |
|
205 |
response = requests.get(style_url, headers=headers)
|
206 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
217 |
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
|
218 |
df8 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
|
219 |
df8 = df8[df8['Importance'] != '']
|
220 |
+
styles = df8
|
221 |
+
return styles
|
|
|
|
|
222 |
|
223 |
def get_job_postings(onetCode, state):
|
224 |
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|