srinuksv committed (verified)
Commit 8106f26 · Parent: 890a670

Update app.py

Files changed (1)
  1. app.py +24 -11
app.py CHANGED
@@ -1,25 +1,37 @@
 import time
-import pandas as pd
-import re
+import logging
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
-from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
 
 app = FastAPI()
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+
 # Serve static files
 app.mount("/static", StaticFiles(directory="static"), name="static")
 
 def scrape_upwork_data(search_query, num_jobs, page):
-    options = Options()
-    options.add_argument("--headless")  # Run in headless mode for faster scraping
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=service, options=options)
+    # Setup Chrome options for remote WebDriver
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--window-size=1920x1080")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--disable-extensions")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+
+    # Setup the remote WebDriver
+    driver = webdriver.Remote(
+        command_executor='http://localhost:4444/wd/hub',
+        options=chrome_options
+    )
 
     job_listings = []
     try:
@@ -44,7 +56,8 @@ def scrape_upwork_data(search_query, num_jobs, page):
             # Check for budget (fixed price or hourly)
             try:
                 budget = job_info.find_element(By.CSS_SELECTOR, 'li[data-test="is-fixed-price"]').text.strip()
-            except:
+            except Exception as e:
+                logging.error(f'Error finding budget: {e}')
                 budget = job_info.find_element(By.CSS_SELECTOR, 'li[data-test="duration-label"]').text.strip()
 
             job_listings.append({
@@ -58,7 +71,7 @@ def scrape_upwork_data(search_query, num_jobs, page):
             })
 
         except Exception as e:
-            print(f'Error parsing job listing: {e}')
+            logging.error(f'Error parsing job listing: {e}')
 
     finally:
         driver.quit()
@@ -86,7 +99,7 @@ async def read_root():
 @app.get("/jobs", response_class=HTMLResponse)
 async def get_jobs(query: str, num_jobs: int = 50):
     jobs = []
-    for page in range(1, 3):  # Change to however many pages you want to scrape
+    for page in range(1, (num_jobs // 50) + 1):  # Scrape as many pages as needed
         job_listings = scrape_upwork_data(query, num_jobs, page)
         jobs.extend(job_listings)
 
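Note on running the updated code: scrape_upwork_data no longer launches a local Chrome (the Service and ChromeDriverManager imports are kept but now unused) and instead expects a Selenium server listening at http://localhost:4444/wd/hub, e.g. one started with the official selenium/standalone-chrome Docker image (docker run -d -p 4444:4444 --shm-size=2g selenium/standalone-chrome). A minimal smoke test for that assumption, using the same URL and options as the diff; the helper name and test URL below are illustrative, not part of the commit:

import logging
from selenium import webdriver

def check_selenium_hub(hub_url="http://localhost:4444/wd/hub"):
    """Open and close one remote session to verify the hub app.py relies on."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    try:
        driver = webdriver.Remote(command_executor=hub_url, options=chrome_options)
    except Exception as e:  # hub not running or unreachable
        logging.error(f"Selenium hub unreachable at {hub_url}: {e}")
        return False
    try:
        driver.get("https://www.upwork.com")  # illustrative target page
        logging.info(f"Remote session OK, landed on: {driver.title!r}")
        return True
    finally:
        driver.quit()

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    check_selenium_hub()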
 
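One behavioral detail in the new paging loop: range(1, (num_jobs // 50) + 1) floors the page count, so the default num_jobs=50 fetches one page, but num_jobs=49 fetches no pages and num_jobs=75 fetches only one. If the intent really is "as many pages as needed" at 50 results per page (an assumption; the page size is not stated in the diff), ceiling division covers the remainder. A sketch:

def pages_needed(num_jobs, per_page=50):
    """Ceiling division: smallest page range covering num_jobs results."""
    return range(1, -(-num_jobs // per_page) + 1)

assert list(pages_needed(50)) == [1]      # exactly one page
assert list(pages_needed(75)) == [1, 2]   # remainder gets a second page
assert list(pages_needed(49)) == [1]      # partial page still fetched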