"""Collect links to product manuals (PDFs and Markdown docs) from DuckDuckGo,
Google, the Internet Archive, GitHub, and Wikipedia."""

import concurrent.futures
import os
import re

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from googlesearch import search


def search_duckduckgo(query):
    print("Fetching DuckDuckGo Links -----")
    try:
        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
        return [res['href'] for res in results]
    except Exception as exc:
        print(f"DuckDuckGo search failed: {exc}")
        return []


def search_google(query):
    print("Fetching Google Links -----")

    links = []

    # Google Custom Search JSON API. The API key and search engine ID are read
    # from the environment rather than hard-coded in the source.
    try:
        api_key = os.environ['GOOGLE_API_KEY']
        search_engine_id = os.environ['GOOGLE_CSE_ID']

        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": api_key,
            "cx": search_engine_id,
            "q": query + " manual filetype:pdf"
        }

        response = requests.get(url, params=params)
        results = response.json()

        for item in results.get('items', []):
            links.append(item['link'])
    except Exception as exc:
        print(f"Google Custom Search failed: {exc}")

    # Fallback: scrape Google results via the googlesearch package.
    try:
        extension = "ext:pdf"
        for result in search(query + " manual " + extension, num_results=5):
            if result.endswith('.pdf'):
                links.append(result)
    except Exception as exc:
        print(f"googlesearch fallback failed: {exc}")

    return links


def search_archive(query):
    print("Fetching Archive Links -----")

    try:
        url = "https://archive.org/advancedsearch.php"
        params = {
            'q': f'{query} manual',
            'fl[]': ['identifier', 'title', 'format'],
            'rows': 50,
            'page': 1,
            'output': 'json'
        }

        response = requests.get(url, params=params)
        data = response.json()

        pdf_files = []

        def extract_hyperlinks(url):
            # Scan an archive.org download page for direct PDF links, and for
            # ISO images whose browsable listings may themselves contain PDFs.
            response = requests.get(url)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.endswith('.pdf'):
                        pdf_files.append(url + '/' + href)
                    if href.endswith('.iso'):
                        extract_pdf_from_iso(url + '/' + href + '/')

        def extract_pdf_from_iso(iso_url):
            # Archive.org exposes ISO contents as a browsable page; collect
            # any PDFs listed inside it.
            iso_response = requests.get(iso_url)

            if iso_response.status_code == 200:
                iso_soup = BeautifulSoup(iso_response.text, 'html.parser')

                for link in iso_soup.find_all('a', href=True):
                    href = link['href']
                    if href.endswith('.pdf'):
                        pdf_files.append('https:' + href)

        def process_doc(doc):
            identifier = doc.get('identifier', 'N/A')
            pdf_link = f"https://archive.org/download/{identifier}"
            extract_hyperlinks(pdf_link)

        # Crawl the download page of every matching item in parallel.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]

            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')

        return pdf_files

    except Exception as exc:
        print(f"Archive.org search failed: {exc}")
        return []


def search_github(query):
    print("Fetching GitHub Links -----")

    try:
        # GitHub code search for Markdown files mentioning the product.
        # The personal access token is read from the environment.
        url = f"https://api.github.com/search/code?q={query}+extension:md"
        headers = {
            'Authorization': f"Token {os.environ['GITHUB_TOKEN']}"
        }

        response = requests.get(url, headers=headers)
        data = response.json()
        links = [item['html_url'] for item in data.get('items', [])]

        return links

    except Exception as exc:
        print(f"GitHub search failed: {exc}")
        return []


def search_wikipedia(product):
    print("Fetching Wikipedia Links -----")

    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "opensearch",
        "search": product,
        "limit": 5,
        "namespace": 0,
        "format": "json"
    }

    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        data = response.json()

        if data and len(data) > 3 and len(data[3]) > 0:
            return data[3]
        else:
            return []

    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []


def extract_similar_products(query):
    print(f"\nFetching similar items of -----> {query}")
    # Ask the DuckDuckGo chat endpoint for a numbered list of similar products
    # and pull out the item names.
    results = DDGS().chat(f'{query} Similar Products')

    pattern = r'^\d+\.\s(.+)$'
    matches = re.findall(pattern, results, re.MULTILINE)
    matches = [item.split(': ')[0] for item in matches]
    return matches


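# Minimal usage sketch (illustrative only): combine the search helpers for a
# single product query. The product string is an arbitrary example, and the
# Google/GitHub helpers assume GOOGLE_API_KEY, GOOGLE_CSE_ID and GITHUB_TOKEN
# are set in the environment.
if __name__ == "__main__":
    product = "Canon EOS 90D"  # hypothetical example query

    manual_links = []
    manual_links += search_duckduckgo(product)
    manual_links += search_google(product)
    manual_links += search_archive(product)
    manual_links += search_github(product)
    manual_links += search_wikipedia(product)

    for link in manual_links:
        print(link)

    for similar in extract_similar_products(product):
        print(f"Similar product: {similar}")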