Spaces:
Build error
Build error
fikird
committed on
Commit
·
44198e0
0
Parent(s):
Add RAG functionality with vector storage and web crawling
Browse files- .gitignore +41 -0
- README.md +80 -0
- app.py +155 -0
- packages.txt +4 -0
- rag_engine.py +93 -0
- requirements.txt +14 -0
- search_engine.py +174 -0
- space.yml +11 -0
.gitignore
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
*.so
|
6 |
+
.Python
|
7 |
+
build/
|
8 |
+
develop-eggs/
|
9 |
+
dist/
|
10 |
+
downloads/
|
11 |
+
eggs/
|
12 |
+
.eggs/
|
13 |
+
lib/
|
14 |
+
lib64/
|
15 |
+
parts/
|
16 |
+
sdist/
|
17 |
+
var/
|
18 |
+
wheels/
|
19 |
+
*.egg-info/
|
20 |
+
.installed.cfg
|
21 |
+
*.egg
|
22 |
+
|
23 |
+
# Virtual Environment
|
24 |
+
venv/
|
25 |
+
env/
|
26 |
+
ENV/
|
27 |
+
|
28 |
+
# IDE
|
29 |
+
.idea/
|
30 |
+
.vscode/
|
31 |
+
*.swp
|
32 |
+
*.swo
|
33 |
+
|
34 |
+
# OS
|
35 |
+
.DS_Store
|
36 |
+
Thumbs.db
|
37 |
+
|
38 |
+
# Project specific
|
39 |
+
*.log
|
40 |
+
cache/
|
41 |
+
.env
|
README.md
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 🔍 Intelligent Web Search Engine
|
2 |
+
|
3 |
+
An advanced AI-powered search engine that provides deep understanding of web content, code analysis, and intelligent insights.
|
4 |
+
|
5 |
+
## 🌟 Features
|
6 |
+
|
7 |
+
- Multi-model AI analysis
|
8 |
+
- Semantic search and caching
|
9 |
+
- Automatic insights generation
|
10 |
+
- Smart follow-up questions
|
11 |
+
- Code-aware analysis
|
12 |
+
- Related searches
|
13 |
+
|
14 |
+
## 🚀 Deployment to Hugging Face Spaces
|
15 |
+
|
16 |
+
1. Create a new Space:
|
17 |
+
- Go to [huggingface.co/spaces](https://huggingface.co/spaces)
|
18 |
+
- Click "Create new Space"
|
19 |
+
- Choose "Gradio" as the SDK
|
20 |
+
- Select "CPU" as the hardware
|
21 |
+
- Name your space (e.g., "intelligent-web-search")
|
22 |
+
|
23 |
+
2. Upload Files:
|
24 |
+
- Upload all files from the `aiws` directory
|
25 |
+
- Make sure to include:
|
26 |
+
- `app.py`
|
27 |
+
- `search_engine.py`
- `rag_engine.py`
|
28 |
+
- `requirements.txt`
|
29 |
+
- `packages.txt`
|
30 |
+
|
31 |
+
3. Space Settings:
|
32 |
+
- Go to the "Settings" tab
|
33 |
+
- Under "Repository secrets", add any required API keys
|
34 |
+
- Under "Variables", set:
|
35 |
+
```
|
36 |
+
PYTHON_PACKAGES_PATH=/home/user/.local/lib/python3.9/site-packages
|
37 |
+
```
|
38 |
+
|
39 |
+
4. The space will automatically build and deploy your app
|
40 |
+
|
41 |
+
## 📦 Local Development
|
42 |
+
|
43 |
+
1. Clone the repository:
|
44 |
+
```bash
|
45 |
+
git clone [your-repo-url]
|
46 |
+
cd aiws
|
47 |
+
```
|
48 |
+
|
49 |
+
2. Install dependencies:
|
50 |
+
```bash
|
51 |
+
pip install -r requirements.txt
|
52 |
+
```
|
53 |
+
|
54 |
+
3. Run the app:
|
55 |
+
```bash
|
56 |
+
python app.py
|
57 |
+
```
|
58 |
+
|
59 |
+
## 🔧 Configuration
|
60 |
+
|
61 |
+
The search engine uses several AI models:
|
62 |
+
- Summarization: facebook/bart-base
|
63 |
+
- Code Understanding: Salesforce/codet5-small
|
64 |
+
- General QA: google/flan-t5-base
|
65 |
+
- Embeddings: sentence-transformers/all-MiniLM-L6-v2
|
66 |
+
|
67 |
+
## 📝 Usage
|
68 |
+
|
69 |
+
1. Enter your search query
|
70 |
+
2. Adjust the maximum number of results (3-10)
|
71 |
+
3. Click "Search"
|
72 |
+
4. View results including:
|
73 |
+
- Key insights
|
74 |
+
- Follow-up questions
|
75 |
+
- Detailed analysis
|
76 |
+
- Related searches
|
77 |
+
|
78 |
+
## 🤝 Contributing
|
79 |
+
|
80 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
app.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from rag_engine import RAGEngine
|
3 |
+
import torch
|
4 |
+
import os
|
5 |
+
import logging
|
6 |
+
import traceback
|
7 |
+
import asyncio
|
8 |
+
|
9 |
+
# Configure logging: timestamped, level-tagged records at INFO and above.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by the handlers defined in this file.
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
def safe_search(query, max_results):
    """Run a search through the RAG engine and return Markdown for the UI.

    Never raises: any failure is logged with its traceback and turned into
    a Markdown error message instead.

    Args:
        query: Free-text search query entered by the user.
        max_results: Maximum number of web results to request.

    Returns:
        A Markdown string: formatted results, a prompt to enter a query
        when the query is blank, or an error message.
    """
    # A blank query cannot produce useful results; answer immediately
    # instead of constructing the engine and crawling the web.
    if not query or not str(query).strip():
        return "# ⚠️ No Results\nPlease enter a search query."

    try:
        rag = RAGEngine()
        # The UI control may hand the count over as a float; the search
        # backend is given a plain int.
        results = asyncio.run(
            rag.search_and_process(str(query).strip(), int(max_results))
        )
        return format_results(results)
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logger.error(error_msg)
        return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
|
26 |
+
|
27 |
+
def format_results(results):
    """Format a search-result payload as a Markdown document.

    Args:
        results: Mapping that may contain the keys ``insights``,
            ``follow_up_questions``, ``results`` and ``similar_queries``.
            A falsy value (``None``, ``{}``) yields a "no results" notice.

    Returns:
        The Markdown text. Sections whose key is absent are omitted.
    """
    if not results:
        return "# ⚠️ No Results\nNo search results were found. Please try a different query."

    # Collect fragments and join once instead of growing a string with +=.
    parts = ["# 🔍 Search Results\n\n"]

    # Add insights section
    if 'insights' in results:
        parts.append(f"## 💡 Key Insights\n{results['insights']}\n\n")

    # Add follow-up questions
    if 'follow_up_questions' in results:
        parts.append("## ❓ Follow-up Questions\n")
        for q in results['follow_up_questions']:
            # Skip blanks and anything that is not text.
            if isinstance(q, str) and q.strip():
                parts.append(f"- {q.strip()}\n")
        parts.append("\n")

    # Add main results
    if 'results' in results:
        parts.append("## 📄 Detailed Results\n\n")
        for i, result in enumerate(results['results'], 1):
            # The title may be missing, None, empty, or span several lines;
            # collapse whitespace and fall back to a placeholder so the
            # heading/link stays valid Markdown.
            title = " ".join(str(result.get('title') or '').split()) or 'Untitled'
            if 'url' in result:
                parts.append(f"### {i}. [{title}]({result['url']})\n")
            else:
                parts.append(f"### {i}. {title}\n")

            content = result.get('processed_content')
            if content:
                if 'summary' in content:
                    parts.append(f"**Summary:** {content['summary']}\n\n")
                # `metadata` may be present but None; treat that as empty.
                description = (content.get('metadata') or {}).get('description')
                if description:
                    parts.append(f"**Description:** {description}\n\n")
                if content.get('content_type') == 'code':
                    parts.append(f"**Code Analysis:** {content.get('explanation', '')}\n\n")
                else:
                    parts.append(f"**Detailed Explanation:** {content.get('explanation', '')}\n\n")

            if 'snippet' in result:
                parts.append(f"**Snippet:** {result['snippet']}\n\n")
            parts.append("---\n\n")

    # Add similar queries if available
    if results.get('similar_queries'):
        parts.append("## 🔄 Related Searches\n")
        for query in results['similar_queries']:
            if isinstance(query, dict) and 'query' in query:
                parts.append(f"- {query['query']}\n")
            elif isinstance(query, str):
                parts.append(f"- {query}\n")

    return "".join(parts)
|
81 |
+
|
82 |
+
def create_demo():
    """Create the Gradio interface.

    Builds a two-column Blocks layout: query box, result-count slider and
    search button on the left, Markdown output on the right. The button is
    wired to ``safe_search``; a set of example queries is offered below.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # Create cache directory
    os.makedirs(".cache", exist_ok=True)

    demo = gr.Blocks(
        title="AI-Powered Search Engine",
        css="""
        .gradio-container {max-width: 1200px !important}
        .markdown-text {font-size: 16px !important}
        """
    )

    with demo:
        gr.Markdown("""
        # 🔍 Intelligent Web Search Engine

        This advanced search engine uses AI to provide deep understanding of search results:
        - 🧠 Multi-model AI analysis
        - 📊 Semantic search and caching
        - 💡 Automatic insights generation
        - ❓ Smart follow-up questions
        - 🔄 Related searches
        """)

        with gr.Row():
            with gr.Column():
                query = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter your search query...",
                    lines=2
                )
                max_results = gr.Slider(
                    minimum=3,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Maximum Results"
                )
                search_btn = gr.Button("🔍 Search", variant="primary")

            with gr.Column():
                output = gr.Markdown(
                    label="Results",
                    show_label=False
                )

        search_btn.click(
            fn=safe_search,
            inputs=[query, max_results],
            outputs=output
        )

        gr.Examples(
            examples=[
                ["What are the latest developments in quantum computing?", 5],
                ["How does Python's asyncio work? Show code examples", 5],
                ["Explain the transformer architecture in deep learning", 5],
                ["What are the environmental impacts of renewable energy?", 5]
            ],
            inputs=[query, max_results],
            outputs=output,
            fn=safe_search,
            # Caching would execute `safe_search` for every example while
            # the app starts (live web crawl plus model loading) and then
            # serve frozen answers; examples only pre-fill the inputs.
            cache_examples=False
        )

    return demo
|
150 |
+
|
151 |
+
# Create the demo at module level so `demo` stays importable by name.
demo = create_demo()

# Launch for Spaces. Guarded so that merely importing this module (for
# example from a test or another script) does not start a web server.
if __name__ == "__main__":
    demo.launch()
|
packages.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python3-dev
|
2 |
+
build-essential
|
3 |
+
git
|
4 |
+
libgomp1
|
rag_engine.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Dict, Any
|
2 |
+
import numpy as np
|
3 |
+
from langchain.vectorstores import FAISS
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
6 |
+
from search_engine import WebSearchEngine
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
class RAGEngine:
    """Web search combined with a vector index over the retrieved text.

    Search results come from ``WebSearchEngine``; their text is split into
    chunks, embedded, and stored in a FAISS vector store so that the chunks
    closest to the query can be attached to the response.
    """

    def __init__(self):
        self.web_search = WebSearchEngine()
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        # Built lazily from the first batch of chunks that gets stored.
        self.vector_store = None

    def process_and_store_content(self, content: str, metadata: Dict[str, Any] = None) -> None:
        """Split ``content`` into chunks and add them to the vector store.

        Args:
            content: Raw text to index. Blank text is ignored.
            metadata: Optional mapping attached to every chunk. Each chunk
                receives its own copy.

        Raises:
            Exception: Whatever the splitter or vector store raises; the
                error is logged and re-raised.
        """
        if not content or not content.strip():
            return

        try:
            # Split content into chunks
            texts = self.text_splitter.split_text(content)
            if not texts:
                # Nothing to embed; do not hand the store an empty batch.
                return

            # One independent dict per chunk: `[d] * n` would alias a single
            # dict, so mutating one chunk's metadata would change them all.
            metadatas = [dict(metadata or {}) for _ in texts]

            # Initialize or update vector store
            if self.vector_store is None:
                self.vector_store = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas)
            else:
                self.vector_store.add_texts(texts, metadatas=metadatas)

        except Exception as e:
            logger.error(f"Error processing content: {str(e)}")
            raise

    async def search_and_process(self, query: str, max_results: int = 5, similarity_k: int = 3) -> Dict:
        """Search the web, index the retrieved text, and attach similar chunks.

        Args:
            query: Search query.
            max_results: Maximum number of web results to request.
            similarity_k: Number of nearest chunks to attach.

        Returns:
            The dict returned by the web search, with a ``similar_chunks``
            list added when the vector store holds data and retrieval
            succeeded.

        Raises:
            Exception: If the web search itself fails (logged, re-raised).
        """
        try:
            # Get web search results
            web_results = self.web_search.search(query, max_results)

            # Indexing and retrieval enrich the response but are not
            # required for it: a failure here is logged and the plain web
            # results are still returned.
            try:
                for result in web_results.get('results', []):
                    # Prefer the full page text; fall back to the snippet
                    # for results that do not carry a 'content' field.
                    text = result.get('content') or result.get('snippet')
                    if text:
                        self.process_and_store_content(
                            text,
                            metadata={'url': result.get('url'), 'title': result.get('title')}
                        )

                # Perform similarity search
                if self.vector_store is not None:
                    web_results['similar_chunks'] = self.get_relevant_context(query, k=similarity_k)
            except Exception as e:
                logger.warning(f"Skipping similarity enrichment: {str(e)}")

            return web_results

        except Exception as e:
            logger.error(f"Error in search_and_process: {str(e)}")
            raise

    def get_relevant_context(self, query: str, k: int = 3) -> List[Dict]:
        """Return the ``k`` stored chunks closest to ``query``.

        Returns:
            A list of dicts with ``content``, ``metadata`` and
            ``similarity_score`` (the store's score converted to a plain
            float). Empty when nothing has been indexed yet.
            NOTE(review): for the FAISS store the score is presumably a
            distance (lower = closer) -- confirm before ranking on it.
        """
        if self.vector_store is None:
            return []

        similar_docs = self.vector_store.similarity_search_with_score(query, k=k)
        return [
            {
                'content': doc.page_content,
                'metadata': doc.metadata,
                # Plain float so the payload stays serializable.
                'similarity_score': float(score)
            }
            for doc, score in similar_docs
        ]
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.14.0
|
2 |
+
requests>=2.31.0
|
3 |
+
beautifulsoup4>=4.12.2
|
4 |
+
transformers>=4.36.0
|
5 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
6 |
+
torch>=2.2.0
|
7 |
+
duckduckgo-search>=4.4.3
|
8 |
+
langchain>=0.1.0
|
9 |
+
sentence-transformers>=2.5.1
|
10 |
+
numpy>=1.26.0
|
11 |
+
tqdm>=4.66.0
|
12 |
+
lxml>=5.1.0
|
13 |
+
protobuf>=4.25.2
|
14 |
+
accelerate>=0.26.1
faiss-cpu>=1.7.4
|
search_engine.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
import os
import time
from typing import Dict, List, Any
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# duckduckgo-search changed its public API between releases: newer versions
# expose the DDGS class, older ones the module-level ddg() helper. Import
# whichever is available so the module loads with either.
try:
    from duckduckgo_search import DDGS
except ImportError:
    DDGS = None
try:
    from duckduckgo_search import ddg
except ImportError:
    ddg = None
|
11 |
+
|
12 |
+
class ModelManager:
    """Owns the AI models used by the search engine, keyed by task name."""

    def __init__(self):
        # Everything is placed on the CPU.
        self.device = "cpu"
        # task name -> loaded model object
        self.models = {}
        self.load_models()

    def load_models(self):
        """Load the summarization pipeline and the embedding model.

        Populates ``self.models`` with the keys ``'summarizer'`` and
        ``'embeddings'``, in that order.
        """
        target_device = self.device

        # Use smaller models for CPU deployment
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-base",
            device=target_device
        )
        self.models['summarizer'] = summarizer

        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": target_device}
        )
        self.models['embeddings'] = embedder
|
32 |
+
|
33 |
+
class ContentProcessor:
    """Processes and analyzes different types of content"""

    def __init__(self):
        self.model_manager = ModelManager()

    def process_content(self, content: str) -> Dict:
        """Summarize ``content`` and return the analysis as a dict.

        Only the first 1024 characters are passed to the summarizer. If
        summarization fails, the summary falls back to the first 200
        characters of the text.

        Args:
            content: Text to analyze.

        Returns:
            Dict with ``summary``, ``content_type`` (always ``'text'``) and
            ``explanation``. Never raises.
        """
        # Nothing to summarize: skip the model call entirely.
        if not content or not content.strip():
            return {
                'summary': "",
                'content_type': 'text',
                'explanation': "Unable to generate detailed analysis."
            }

        try:
            # Generate summary
            summary = self.model_manager.models['summarizer'](
                content[:1024],
                max_length=100,
                min_length=30,
                do_sample=False
            )[0]['summary_text']

            return {
                'summary': summary,
                'content_type': 'text',
                'explanation': summary
            }
        except Exception as e:
            print(f"Error processing content: {str(e)}")
            # Mark the fallback as truncated only when text was cut off.
            ellipsis = "..." if len(content) > 200 else ""
            return {
                'summary': content[:200] + ellipsis,
                'content_type': 'text',
                'explanation': "Unable to generate detailed analysis."
            }
|
62 |
+
|
63 |
+
class WebSearchEngine:
    """Main search engine class.

    Queries DuckDuckGo, downloads each hit, extracts its paragraph text and
    metadata, and summarizes the result through ``ContentProcessor``.
    """

    def __init__(self):
        self.processor = ContentProcessor()
        self.session = requests.Session()
        # Minimum number of seconds between two page downloads.
        self.request_delay = 1.0
        # time.time() of the most recent download attempt (0 = none yet).
        self.last_request_time = 0

    def is_valid_url(self, url: str) -> bool:
        """Return True if ``url`` is an http(s) URL with a network location."""
        try:
            parsed = urlparse(url)
        except (TypeError, ValueError, AttributeError):
            # Malformed URL or a non-string argument.
            return False
        return bool(parsed.netloc and parsed.scheme in ('http', 'https'))

    def get_metadata(self, soup: "BeautifulSoup") -> Dict:
        """Extract the page title and meta description from parsed HTML.

        Returns:
            Dict with ``title`` and ``description``; both are ``""`` when
            the page does not provide them.
        """
        title_tag = soup.title
        # `.string` is None when <title> is empty or has nested markup.
        raw_title = title_tag.string if title_tag is not None else None
        title = str(raw_title).strip() if raw_title else ""

        # Look the tag up once instead of searching the tree twice.
        description_tag = soup.find("meta", attrs={"name": "description"})
        description = ""
        if description_tag is not None:
            description = description_tag.get("content", "") or ""

        return {
            "title": title,
            "description": description
        }

    def process_url(self, url: str) -> Dict:
        """Download and analyze a single URL.

        Returns:
            Dict with ``url``, ``title``, ``snippet``, ``content`` (the
            full paragraph text) and ``processed_content``; or ``None``
            when the request fails, the response is not OK, or the page
            has no paragraph text.
        """
        try:
            # Respect rate limiting
            elapsed = time.time() - self.last_request_time
            if elapsed < self.request_delay:
                time.sleep(self.request_delay - elapsed)

            try:
                response = self.session.get(url, timeout=10)
            finally:
                # Count failed attempts too, so a timeout does not let the
                # next request skip the delay.
                self.last_request_time = time.time()

            if not response.ok:
                return None

            soup = BeautifulSoup(response.text, 'lxml')
            metadata = self.get_metadata(soup)

            # Extract main content
            content = ' '.join(p.get_text() for p in soup.find_all('p')).strip()

            if not content:
                return None

            processed_content = self.processor.process_content(content)
            processed_content['metadata'] = metadata

            # Add the ellipsis only when the snippet was actually cut.
            snippet = content[:200] + ("..." if len(content) > 200 else "")

            return {
                'url': url,
                'title': metadata['title'],
                'snippet': snippet,
                # Full text, so callers can index more than the snippet.
                'content': content,
                'processed_content': processed_content
            }

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return None

    def _fetch_search_hits(self, query: str, max_results: int) -> List[Dict]:
        """Return raw DuckDuckGo hits using whichever client API is installed."""
        if DDGS is not None:
            with DDGS() as client:
                return list(client.text(query, max_results=max_results))
        if ddg is not None:
            # The legacy helper may return None when there are no results.
            return list(ddg(query, max_results=max_results) or [])
        raise RuntimeError("duckduckgo_search provides neither DDGS nor ddg")

    def search(self, query: str, max_results: int = 5) -> Dict:
        """Perform a search and process the results.

        Returns:
            Dict with ``results``, ``insights``, ``follow_up_questions`` and
            ``similar_queries``. On failure ``results`` is empty and
            ``insights`` carries the error text; this method does not raise.
        """
        try:
            # Search using DuckDuckGo
            search_results = self._fetch_search_hits(query, int(max_results))

            # Process results
            processed_results = []
            for hit in search_results:
                # Hits expose their URL under 'href'; 'link' is accepted as
                # an alternative spelling.
                link = hit.get('href') or hit.get('link')
                if not link or not self.is_valid_url(link):
                    continue
                processed = self.process_url(link)
                if processed:
                    processed_results.append(processed)

            # Generate insights
            if processed_results:
                all_content = ' '.join(
                    r['processed_content']['summary'] for r in processed_results
                )
                insights = self.processor.process_content(all_content)['summary']
            else:
                # Nothing was retrieved; do not summarize an empty string.
                insights = "No pages could be retrieved for this query."

            # Generate follow-up questions
            follow_up_questions = [
                f"What are the key differences between {query} and related topics?",
                f"How has {query} evolved over time?",
                f"What are the practical applications of {query}?"
            ]

            return {
                'results': processed_results,
                'insights': insights,
                'follow_up_questions': follow_up_questions,
                'similar_queries': []
            }

        except Exception as e:
            print(f"Error during search: {str(e)}")
            return {
                'results': [],
                'insights': f"Error performing search: {str(e)}",
                'follow_up_questions': [],
                'similar_queries': []
            }
|
169 |
+
|
170 |
+
# Module-level convenience entry point
def search(query: str, max_results: int = 5) -> Dict:
    """Run ``query`` through a freshly constructed ``WebSearchEngine``.

    Args:
        query: Search query.
        max_results: Maximum number of results to request.

    Returns:
        Whatever ``WebSearchEngine.search`` returns for these arguments.
    """
    return WebSearchEngine().search(query, max_results)
|
space.yml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
title: Intelligent Search Engine
|
2 |
+
emoji: 🔍
|
3 |
+
colorFrom: blue
|
4 |
+
colorTo: indigo
|
5 |
+
sdk: gradio
|
6 |
+
sdk_version: 4.14.0
|
7 |
+
python_version: "3.10"
|
8 |
+
app_file: app.py
|
9 |
+
app_port: 7860
|
10 |
+
pinned: false
|
11 |
+
license: apache-2.0
|