celise88 committed on
Commit 42ca295 · 1 Parent(s): fc51d61

add additional onet scraping functionality

Files changed (3)
  1. main.py +19 -2
  2. requirements.txt +2 -1
  3. scrape_onet.py +54 -17
main.py CHANGED
@@ -18,7 +18,7 @@ from mangum import Mangum
from localStoragePy import localStoragePy
localStorage = localStoragePy('pathfinder', 'text')

- from scrape_onet import get_onet_code, get_onet_description, get_onet_tasks, get_job_postings
+ from scrape_onet import get_onet_code, get_onet_description, get_onet_tasks, get_onet_activities, get_onet_context, get_onet_skills, get_onet_knowledge, get_onet_abilities, get_onet_interests, get_onet_styles, get_onet_values, get_job_postings
from match_utils import neighborhoods, get_resume, skillNER, sim_result_loop, get_links, coSkillEmbed, sim_result_loop_jobFinder, sim_result_loop_candFinder
from user_utils import Hash

@@ -122,13 +122,30 @@ def post_job(request: Request, bt: BackgroundTasks, jobtitle: str = Form(enum=[x
onetCode = get_onet_code(jobtitle)
jobdescription = get_onet_description(onetCode)
tasks = get_onet_tasks(onetCode)
+ activities = get_onet_activities(onetCode)
+ context = get_onet_context(onetCode)
+ skills = get_onet_skills(onetCode)
+ knowledge = get_onet_knowledge(onetCode)
+ abilities = get_onet_abilities(onetCode)
+ interests = get_onet_interests(onetCode)
+ values = get_onet_values(onetCode)
+ styles = get_onet_styles(onetCode)
+
bt.add_task(neighborhoods, jobtitle)
return templates.TemplateResponse('job_list.html', context={
'request': request,
'joblist': joblist,
'jobtitle': jobtitle,
'jobdescription': jobdescription,
- 'tasks': tasks})
+ 'tasks': tasks,
+ 'activities': activities,
+ 'context': context,
+ 'knowledge': knowledge,
+ 'abilities': abilities,
+ 'skills': skills,
+ 'interests': interests,
+ 'values': values,
+ 'styles': styles})

### JOB NEIGHBORHOODS ###
@app.get("/explore-job-neighborhoods/", response_class=HTMLResponse)
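With this change the `post_job` route passes eight additional O*NET report DataFrames to `job_list.html`. A minimal sketch of exercising the new helpers outside the FastAPI route, mirroring the calls made inside `post_job`, is below; the job title is a hypothetical placeholder and is not part of the commit.

```python
# Sketch only (not part of this commit): call the new scrape_onet helpers directly,
# the same way post_job does. "Registered Nurses" is a hypothetical example title.
from scrape_onet import get_onet_code, get_onet_skills, get_onet_styles

onetCode = get_onet_code("Registered Nurses")   # same lookup post_job performs
skills = get_onet_skills(onetCode)              # DataFrame: 'Importance', 'Candidate Characteristic'
styles = get_onet_styles(onetCode)
print(skills.head())
print(styles.head())
```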
requirements.txt CHANGED
@@ -21,4 +21,5 @@ bcrypt==4.0.1
passlib==1.7.4
localStoragePy==0.2.3
sentence-transformers==2.2.2
- mangum==0.17.0
+ mangum==0.17.0
+ certifi==2023.7.22
scrape_onet.py CHANGED
@@ -36,12 +36,11 @@ def get_onet_tasks(onetCode):
tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
return tasks

- def get_onet_ratings(onetCode):
+ def get_onet_activities(onetCode):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}

activities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wa&n_wa=0&s_wa=IM&c_wa=0"
- context_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=cx&n_cx=0&c_cx=0&s_cx=n"
-
+
response = requests.get(activities_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
@@ -57,7 +56,14 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(' ) ', '')])
df = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
df = df[df['Importance'] != '']
+ activities = df
+ return activities

+ def get_onet_context(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ context_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=cx&n_cx=0&c_cx=0&s_cx=n"
+
response = requests.get(context_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
@@ -73,16 +79,14 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df2 = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
df2 = df2[df2['Importance'] != '']
-
- job_df = pd.concat([df, df2], axis = 0)
+ context = df2
+ return context

- skills_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=sk&n_sk=0&s_sk=IM&c_sk=0"
- knowledge_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=kn&n_kn=0&s_kn=IM&c_kn=0"
- abilities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ab&n_ab=0&s_ab=IM&c_ab=0"
- interests_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=in&c_in=0"
- values_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wv&c_wv=0"
- style_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ws&n_ws=0&c_ws=0"
+ def get_onet_skills(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}

+ skills_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=sk&n_sk=0&s_sk=IM&c_sk=0"
+
response = requests.get(skills_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
@@ -98,6 +102,13 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df3 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
df3 = df3[df3['Importance'] != '']
+ skills = df3
+ return skills
+
+ def get_onet_knowledge(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ knowledge_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=kn&n_kn=0&s_kn=IM&c_kn=0"

response = requests.get(knowledge_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
@@ -114,7 +125,14 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df4 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
df4 = df4[df4['Importance'] != '']
-
+ knowledge = df4
+ return knowledge
+
+ def get_onet_abilities(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ abilities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ab&n_ab=0&s_ab=IM&c_ab=0"
+
response = requests.get(abilities_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
@@ -130,7 +148,14 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df5 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
df5 = df5[df5['Importance'] != '']
-
+ abilities = df5
+ return abilities
+
+ def get_onet_interests(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ interests_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=in&c_in=0"
+
response = requests.get(interests_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
@@ -146,7 +171,14 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df6 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
df6 = df6[df6['Importance'] != '']
+ interests = df6
+ return interests

+ def get_onet_values(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ values_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wv&c_wv=0"
+
response = requests.get(values_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
@@ -162,6 +194,13 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df7 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
df7 = df7[df7['Importance'] != '']
+ values = df7
+ return values
+
+ def get_onet_styles(onetCode):
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ style_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ws&n_ws=0&c_ws=0"

response = requests.get(style_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
@@ -178,10 +217,8 @@ def get_onet_ratings(onetCode):
num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
df8 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
df8 = df8[df8['Importance'] != '']
-
- cand_df = pd.concat([df3, df4, df5, df6, df7, df8], axis = 0)
-
- return [job_df, cand_df]
+ styles = df8
+ return styles

def get_job_postings(onetCode, state):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
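The eight new functions all follow the same pipeline: request one O*NET "link/result" report with a desktop User-Agent, parse it with BeautifulSoup, split the text into importance/description pairs, and return a two-column pandas DataFrame. A hedged sketch of how that shared pipeline could be factored into a single parameterized helper is below; the helper name `_scrape_onet_report`, its parameters, the example O*NET code, and the simplified cleaning step are illustrative assumptions, not part of this commit.

```python
# Illustrative only: a parameterized version of the fetch -> parse -> clean -> DataFrame
# pipeline repeated in each new get_onet_* function. The cleaning step is simplified
# relative to the per-function logic in the diff.
import requests
import pandas as pd
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                         'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}

def _scrape_onet_report(onetCode, url_suffix, column):
    # Fetch one O*NET report (e.g. skills, knowledge, work styles) and tabulate it.
    url = "https://www.onetonline.org/link/result/" + onetCode + url_suffix
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(' ')
    rows = []
    for chunk in text.split(')'):
        importance = ''.join(c for c in chunk if c in '0123456789')
        description = ''.join(c for c in chunk if c not in '0123456789').strip()
        if importance:
            rows.append([importance, description])
    return pd.DataFrame(rows, columns=['Importance', column])

# e.g. the skills report, matching the URL used by get_onet_skills in the diff;
# "29-1141.00" is an illustrative O*NET code.
skills = _scrape_onet_report("29-1141.00", "?c=sk&n_sk=0&s_sk=IM&c_sk=0", 'Candidate Characteristic')
print(skills.head())
```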