import gdown
import os
import numpy as np
import torch
import onnxruntime
from urllib.parse import urlparse, parse_qs, urljoin
import requests
from bs4 import BeautifulSoup
import time
from collections import deque
def download_pdf_from_gdrive(url, output_path=None):
    """
    Download a PDF file from Google Drive using the provided sharing URL.

    Parameters:
        url (str): The Google Drive sharing URL of the PDF file
        output_path (str, optional): The path where the PDF should be saved.
                                     If not provided, saves in the current directory.

    Returns:
        str: Path to the downloaded file if successful, None if the download
             failed or the URL was invalid (errors are caught and printed,
             not re-raised).
    """
    try:
        # Check if URL is provided
        if not url:
            raise ValueError("URL cannot be empty")

        # Handle different types of Google Drive URLs
        if 'drive.google.com' not in url:
            raise ValueError("Not a valid Google Drive URL")

        # Extract file ID from the URL
        if '/file/d/' in url:
            file_id = url.split('/file/d/')[1].split('/')[0]
        elif 'id=' in url:
            file_id = parse_qs(urlparse(url).query)['id'][0]
        else:
            raise ValueError("Could not extract file ID from the URL")

        # Set default output path if none provided
        if not output_path:
            output_path = 'downloaded_file.pdf'

        # Ensure the output path ends with .pdf
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        # Create the directory if it doesn't exist
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        # Download the file
        output = gdown.download(id=file_id, output=output_path, quiet=False)

        if output is None:
            raise ValueError("Download failed - file might be inaccessible or not exist")

        return output

    except Exception as e:
        print(f"Error downloading PDF: {str(e)}")
        return None
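

# Hedged usage sketch (illustrative only, not called by the app): shows how
# download_pdf_from_gdrive might be driven. The sharing URL and output path
# below are hypothetical placeholders, not values from the original code.
def _example_download_pdf():
    example_url = "https://drive.google.com/file/d/FILE_ID_HERE/view?usp=sharing"  # placeholder file ID
    saved_path = download_pdf_from_gdrive(example_url, output_path="downloaded_file.pdf")
    if saved_path:
        print(f"PDF saved to {saved_path}")
    else:
        print("Download failed or the file is not publicly accessible")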
def merge_strings_with_prefix(strings):
    """Merges consecutive strings in a list, starting a new chunk whenever a
    string begins with the bullet prefix "•".

    Args:
        strings: A list of strings.

    Returns:
        A single string containing the merged chunks joined by spaces.
    """
    result = []
    current_merged_string = ""
    for string in strings:
        if string.startswith("•"):
            # A bullet marks the start of a new chunk; flush the previous one
            if current_merged_string:
                result.append(current_merged_string)
            current_merged_string = string
        else:
            # Continuation line: append it to the current chunk
            current_merged_string += string
    if current_merged_string:
        result.append(current_merged_string)
    return ' '.join(result)
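

# Hedged usage sketch (illustrative only, not called by the app): the input
# list below is invented to show how bullet lines are merged with their
# continuation lines.
def _example_merge_bullets():
    lines = ["• First point", " continues here", "• Second point"]
    merged = merge_strings_with_prefix(lines)
    # Expected result: "• First point continues here • Second point"
    print(merged)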
def scrape_website(start_url, delay=1):
    """
    Scrapes all pages of a website and returns their content as a single string.

    Args:
        start_url (str): The starting URL of the website
        delay (int): Delay between requests in seconds to be polite

    Returns:
        str: Combined content from all pages
    """
    # Initialize structures for tracking visited pages and pending URLs
    visited_urls = set()
    domain = urlparse(start_url).netloc
    queue = deque([start_url])
    all_content = []

    def is_valid_url(url):
        """Check if URL belongs to the same domain and is a webpage"""
        parsed = urlparse(url)
        return (
            parsed.netloc == domain and
            parsed.path.split('.')[-1] not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
            '#' not in url
        )
    def extract_text_content(soup):
        """Extract meaningful text content from a BeautifulSoup object"""
        # Remove script, style, and page-chrome elements (header, footer, nav)
        for element in soup(["script", "style", "header", "footer", "nav"]):
            element.decompose()
        # Get text content
        text = soup.get_text(separator=' ', strip=True)
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text

    def get_links(soup, base_url):
        """Extract all valid links from a page"""
        links = []
        for a_tag in soup.find_all('a', href=True):
            url = urljoin(base_url, a_tag['href'])
            if is_valid_url(url):
                links.append(url)
        return links
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    # Main scraping loop (breadth-first over same-domain links)
    while queue:
        url = queue.popleft()
        if url in visited_urls:
            continue
        # Mark as visited up front so a failing URL is not retried later
        visited_urls.add(url)

        try:
            print(f"Scraping: {url}")
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract content
            content = extract_text_content(soup)
            all_content.append(f"URL: {url}\n{content}\n")

            # Add new links to queue
            links = get_links(soup, url)
            for link in links:
                if link not in visited_urls:
                    queue.append(link)

            time.sleep(delay)  # Be polite
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            continue

    # Combine all content into a single string
    combined_content = "\n\n".join(all_content)
    return combined_content
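

# Hedged usage sketch (illustrative only, not called by the app): the start
# URL below is a placeholder and the 2-second delay is an arbitrary choice,
# neither comes from the original code.
def _example_scrape():
    site_text = scrape_website("https://example.com", delay=2)
    print(f"Scraped {len(site_text)} characters of text")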