aiws / search_engine.py
fikird
Update duckduckgo-search implementation and fix imports
d7b6953
raw
history blame
5.89 kB
from typing import Dict, List, Any
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from transformers import pipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
import time
import json
import os
from urllib.parse import urlparse
class ModelManager:
    """Load and hold the AI models used by the rest of the module.

    After construction, ``models`` maps a task name to a ready-to-use model:
    ``'summarizer'`` is a summarization pipeline and ``'embeddings'`` is a
    sentence-embedding model. Both are placed on ``device``.
    """

    def __init__(self):
        # Everything runs on CPU; load_models() reads this attribute.
        self.device = "cpu"
        self.models = {}
        self.load_models()

    def load_models(self):
        """Instantiate every model and register it in ``self.models``."""
        target_device = self.device

        # Use smaller models for CPU deployment
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-base",
            device=target_device,
        )
        self.models['summarizer'] = summarizer

        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": target_device},
        )
        self.models['embeddings'] = embedder
class ContentProcessor:
    """Summarise page text with the summarizer held by ``ModelManager``."""

    def __init__(self):
        self.model_manager = ModelManager()

    def process_content(self, content: str) -> Dict:
        """Summarise *content* and return it alongside the original text.

        Args:
            content: Plain text to summarise. Only the first 1024 characters
                are passed to the summarizer.

        Returns:
            A dict with two keys: ``'summary'`` and ``'content'`` (the input,
            unchanged). ``'summary'`` is an empty string when there is no text
            to summarise, and an ``"Error processing content: ..."`` message
            when the summarizer raises; this method itself never raises.
        """
        # Nothing to summarise: skip the model entirely instead of asking it
        # to generate a summary out of an empty prompt.
        if not content or not content.strip():
            return {
                'summary': '',
                'content': content
            }

        try:
            # Generate summary. The character cap keeps the prompt short, and
            # truncation=True additionally guards the model's token limit for
            # text whose characters expand into many tokens.
            summary = self.model_manager.models['summarizer'](
                content[:1024],
                max_length=100,
                min_length=30,
                do_sample=False,
                truncation=True
            )[0]['summary_text']
            return {
                'summary': summary,
                'content': content
            }
        except Exception as e:
            # Best effort: report the failure in-band so callers always get
            # the same result shape.
            return {
                'summary': f"Error processing content: {str(e)}",
                'content': content
            }
class WebSearchEngine:
    """Search DuckDuckGo, fetch each hit and summarise its text.

    Attributes set in ``__init__``:
        processor: ``ContentProcessor`` used to summarise page text.
        session: ``requests.Session`` shared by all page fetches.
        request_delay: minimum number of seconds between two page fetches.
        last_request_time: ``time.time()`` of the latest fetch attempt
            (``0`` until the first one).
        ddgs: ``DDGS`` client used to run the text search.
    """

    def __init__(self):
        self.processor = ContentProcessor()
        self.session = requests.Session()
        self.request_delay = 1.0
        self.last_request_time = 0
        self.ddgs = DDGS()

    def is_valid_url(self, url: str) -> bool:
        """Return True when *url* has both a scheme and a network location."""
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # urlparse raises ValueError for malformed hosts (e.g. an unclosed
            # IPv6 bracket); non-string input surfaces as TypeError or
            # AttributeError. Anything else is a real bug and should propagate.
            return False
        return bool(parsed.netloc and parsed.scheme)

    def get_metadata(self, soup: "BeautifulSoup") -> Dict:
        """Extract the page title and meta description from *soup*.

        Returns:
            A dict with ``'title'`` (``"No title"`` when the page has no
            usable ``<title>`` text) and ``'description'`` (``""`` when there
            is no ``<meta name="description">`` tag or it has no content).
        """
        title_tag = soup.title
        # .string is None when <title> is empty or wraps nested markup, so
        # check it as well as the tag itself.
        if title_tag is not None and title_tag.string:
            title = title_tag.string
        else:
            title = "No title"

        meta_tag = soup.find("meta", attrs={"name": "description"})
        description = meta_tag.get("content", "") if meta_tag is not None else ""

        return {
            'title': title,
            'description': description
        }

    def _fetch(self, url: str):
        """GET *url* through the shared session, honouring ``request_delay``."""
        remaining = self.request_delay - (time.time() - self.last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        try:
            return self.session.get(url, timeout=10)
        finally:
            # Record the attempt even when the request raises, so a failing
            # host cannot be used to bypass the rate limit.
            self.last_request_time = time.time()

    @staticmethod
    def _extract_text(soup) -> str:
        """Return the visible text of *soup* as one space-separated string.

        Note: ``<script>`` and ``<style>`` elements are removed from *soup*
        in place before the text is read.
        """
        for tag in soup(["script", "style"]):
            tag.decompose()
        lines = (line.strip() for line in soup.get_text().splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return ' '.join(chunk for chunk in chunks if chunk)

    def process_url(self, url: str) -> Dict:
        """Fetch *url*, extract its text and metadata, and summarise it.

        Returns:
            On success, a dict with ``'url'``, ``'title'``, ``'description'``,
            ``'summary'`` and ``'content'``. On any failure (invalid URL,
            non-200 status, network or parsing error), a dict with a single
            ``'error'`` key describing the problem. This method never raises.
        """
        if not self.is_valid_url(url):
            return {'error': f"Invalid URL: {url}"}

        try:
            response = self._fetch(url)
            if response.status_code != 200:
                return {'error': f"Failed to fetch URL: {url}, status code: {response.status_code}"}

            soup = BeautifulSoup(response.text, 'lxml')
            content = self._extract_text(soup)
            metadata = self.get_metadata(soup)
            processed = self.processor.process_content(content)

            return {
                'url': url,
                'title': metadata['title'],
                'description': metadata['description'],
                'summary': processed['summary'],
                'content': processed['content']
            }
        except Exception as e:
            return {'error': f"Error processing {url}: {str(e)}"}

    def search(self, query: str, max_results: int = 5) -> Dict:
        """Run a DuckDuckGo text search and process every hit.

        Args:
            query: The search query.
            max_results: Maximum number of search hits to request.

        Returns:
            A dict with ``'results'`` (the successfully processed pages),
            ``'insights'`` (the first 1000 characters of the joined
            summaries, or a placeholder when there are none) and
            ``'follow_up_questions'``. If the search itself fails, a dict
            with a single ``'error'`` key is returned instead.
        """
        try:
            results = []
            for hit in self.ddgs.text(query, max_results=max_results):
                # duckduckgo_search reports the target URL under 'href';
                # 'link' is still accepted for result dicts that use it.
                url = hit.get('href') or hit.get('link')
                if not url:
                    continue
                processed = self.process_url(url)
                # Pages that could not be processed are dropped (best effort).
                if 'error' not in processed:
                    results.append(processed)

            # Generate insights from results
            all_content = " ".join(r['summary'] for r in results if 'summary' in r)

            return {
                'results': results,
                'insights': all_content[:1000] if all_content else "No insights available.",
                'follow_up_questions': [
                    f"What are the key differences between {query} and related topics?",
                    f"Can you explain {query} in simple terms?",
                    f"What are the latest developments in {query}?"
                ]
            }
        except Exception as e:
            return {'error': f"Search failed: {str(e)}"}
# Main search function
# Engine shared by every call to search(); created on first use because
# building a WebSearchEngine loads the summarization and embedding models.
_default_engine = None


def search(query: str, max_results: int = 5) -> Dict:
    """Search the web for *query* using a shared ``WebSearchEngine``.

    The engine is created lazily on the first call and reused afterwards, so
    the models, HTTP session and rate-limit state are not rebuilt per query.
    If construction fails, the exception propagates and the next call retries.

    Args:
        query: The search query.
        max_results: Maximum number of search hits to request.

    Returns:
        Whatever ``WebSearchEngine.search`` returns for the same arguments.
    """
    global _default_engine
    if _default_engine is None:
        _default_engine = WebSearchEngine()
    return _default_engine.search(query, max_results)