fikird committed
Commit 44198e0 (0 parents)

Add RAG functionality with vector storage and web crawling

Files changed (8):
  1. .gitignore +41 -0
  2. README.md +80 -0
  3. app.py +155 -0
  4. packages.txt +4 -0
  5. rag_engine.py +93 -0
  6. requirements.txt +15 -0
  7. search_engine.py +175 -0
  8. space.yml +11 -0
.gitignore ADDED
@@ -0,0 +1,41 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Project specific
+ *.log
+ cache/
+ .env
README.md ADDED
@@ -0,0 +1,80 @@
+ # 🔍 Intelligent Web Search Engine
+
+ An advanced AI-powered search engine that provides deep understanding of web content, code analysis, and intelligent insights.
+
+ ## 🌟 Features
+
+ - Multi-model AI analysis
+ - Semantic search and caching
+ - Automatic insights generation
+ - Smart follow-up questions
+ - Code-aware analysis
+ - Related searches
+
+ ## 🚀 Deployment to Hugging Face Spaces
+
+ 1. Create a new Space:
+    - Go to [huggingface.co/spaces](https://huggingface.co/spaces)
+    - Click "Create new Space"
+    - Choose "Gradio" as the SDK
+    - Select "CPU" as the hardware
+    - Name your space (e.g., "intelligent-web-search")
+
+ 2. Upload Files:
+    - Upload all files from the `aiws` directory (including the new `rag_engine.py`)
+    - Make sure to include:
+      - `app.py`
+      - `search_engine.py`
+      - `requirements.txt`
+      - `packages.txt`
+
+ 3. Space Settings:
+    - Go to the "Settings" tab
+    - Under "Repository secrets", add any required API keys
+    - Under "Variables", set:
+      ```
+      PYTHON_PACKAGES_PATH=/home/user/.local/lib/python3.9/site-packages
+      ```
+
+ 4. The space will automatically build and deploy your app
+
+ ## 📦 Local Development
+
+ 1. Clone the repository:
+    ```bash
+    git clone [your-repo-url]
+    cd aiws
+    ```
+
+ 2. Install dependencies:
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. Run the app:
+    ```bash
+    python app.py
+    ```
+
+ ## 🔧 Configuration
+
+ The search engine uses several AI models:
+ - Summarization: facebook/bart-base
+ - Code Understanding: Salesforce/codet5-small
+ - General QA: google/flan-t5-base
+ - Embeddings: sentence-transformers/all-MiniLM-L6-v2
+
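For reference, the summarization and embedding models are initialized on CPU in `search_engine.py` and `rag_engine.py` roughly as follows (a minimal sketch, not the full pipeline):

```python
from transformers import pipeline
from langchain.embeddings import HuggingFaceEmbeddings

# Summarization model used by ContentProcessor (CPU-only)
summarizer = pipeline("summarization", model="facebook/bart-base", device="cpu")

# Embedding model shared by the search and RAG engines
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)
```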
+ ## 📝 Usage
+
+ 1. Enter your search query
+ 2. Adjust the maximum number of results (3-10)
+ 3. Click "Search"
+ 4. View results including:
+    - Key insights
+    - Follow-up questions
+    - Detailed analysis
+    - Related searches
+
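The same flow can also be driven from Python instead of the UI, mirroring what `app.py` does internally (a minimal sketch; the query string is just an example):

```python
import asyncio
from rag_engine import RAGEngine

async def main():
    rag = RAGEngine()
    # Search the web, index page content, and attach similar chunks
    results = await rag.search_and_process("transformer architecture", max_results=5)
    print(results["insights"])
    for item in results["results"]:
        print(item["title"], "->", item["url"])

if __name__ == "__main__":
    asyncio.run(main())
```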
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
app.py ADDED
@@ -0,0 +1,155 @@
+ import gradio as gr
+ from rag_engine import RAGEngine
+ import torch
+ import os
+ import logging
+ import traceback
+ import asyncio
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ def safe_search(query, max_results):
+     """Wrapper function to handle errors gracefully"""
+     try:
+         rag = RAGEngine()
+         results = asyncio.run(rag.search_and_process(query, max_results))
+         return format_results(results)
+     except Exception as e:
+         error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+         logger.error(error_msg)
+         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
+
+ def format_results(results):
+     """Format search results for display"""
+     if not results:
+         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+     formatted = f"# 🔍 Search Results\n\n"
+
+     # Add insights section
+     if 'insights' in results:
+         formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+     # Add follow-up questions
+     if 'follow_up_questions' in results:
+         formatted += "## ❓ Follow-up Questions\n"
+         for q in results['follow_up_questions']:
+             if q and q.strip():
+                 formatted += f"- {q.strip()}\n"
+         formatted += "\n"
+
+     # Add main results
+     if 'results' in results:
+         formatted += "## 📄 Detailed Results\n\n"
+         for i, result in enumerate(results['results'], 1):
+             formatted += f"### {i}. "
+             if 'url' in result:
+                 formatted += f"[{result.get('title', 'Untitled')}]({result['url']})\n"
+             else:
+                 formatted += f"{result.get('title', 'Untitled')}\n"
+
+             if result.get('processed_content'):
+                 content = result['processed_content']
+                 if 'summary' in content:
+                     formatted += f"**Summary:** {content['summary']}\n\n"
+                 if content.get('metadata', {}).get('description'):
+                     formatted += f"**Description:** {content['metadata']['description']}\n\n"
+                 if content.get('content_type') == 'code':
+                     formatted += f"**Code Analysis:** {content.get('explanation', '')}\n\n"
+                 else:
+                     formatted += f"**Detailed Explanation:** {content.get('explanation', '')}\n\n"
+
+             if 'snippet' in result:
+                 formatted += f"**Snippet:** {result['snippet']}\n\n"
+             formatted += "---\n\n"
+
+     # Add similar queries if available
+     if results.get('similar_queries'):
+         formatted += "## 🔄 Related Searches\n"
+         for query in results['similar_queries']:
+             if isinstance(query, dict) and 'query' in query:
+                 formatted += f"- {query['query']}\n"
+             elif isinstance(query, str):
+                 formatted += f"- {query}\n"
+
+     return formatted
+
+ def create_demo():
+     """Create the Gradio interface"""
+
+     # Create cache directory
+     os.makedirs(".cache", exist_ok=True)
+
+     demo = gr.Blocks(
+         title="AI-Powered Search Engine",
+         css="""
+         .gradio-container {max-width: 1200px !important}
+         .markdown-text {font-size: 16px !important}
+         """
+     )
+
+     with demo:
+         gr.Markdown("""
+         # 🔍 Intelligent Web Search Engine
+
+         This advanced search engine uses AI to provide deep understanding of search results:
+         - 🧠 Multi-model AI analysis
+         - 📊 Semantic search and caching
+         - 💡 Automatic insights generation
+         - ❓ Smart follow-up questions
+         - 🔄 Related searches
+         """)
+
+         with gr.Row():
+             with gr.Column():
+                 query = gr.Textbox(
+                     label="Search Query",
+                     placeholder="Enter your search query...",
+                     lines=2
+                 )
+                 max_results = gr.Slider(
+                     minimum=3,
+                     maximum=10,
+                     value=5,
+                     step=1,
+                     label="Maximum Results"
+                 )
+                 search_btn = gr.Button("🔍 Search", variant="primary")
+
+             with gr.Column():
+                 output = gr.Markdown(
+                     label="Results",
+                     show_label=False
+                 )
+
+         search_btn.click(
+             fn=safe_search,
+             inputs=[query, max_results],
+             outputs=output
+         )
+
+         gr.Examples(
+             examples=[
+                 ["What are the latest developments in quantum computing?", 5],
+                 ["How does Python's asyncio work? Show code examples", 5],
+                 ["Explain the transformer architecture in deep learning", 5],
+                 ["What are the environmental impacts of renewable energy?", 5]
+             ],
+             inputs=[query, max_results],
+             outputs=output,
+             fn=safe_search,
+             cache_examples=True
+         )
+
+     return demo
+
+ # Create the demo
+ demo = create_demo()
+
+ # Launch for Spaces
+ demo.launch()
packages.txt ADDED
@@ -0,0 +1,4 @@
+ python3-dev
+ build-essential
+ git
+ libgomp1
rag_engine.py ADDED
@@ -0,0 +1,93 @@
+ from typing import List, Dict, Any
+ import numpy as np
+ from langchain.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from search_engine import WebSearchEngine
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class RAGEngine:
+     def __init__(self):
+         self.web_search = WebSearchEngine()
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2",
+             model_kwargs={"device": "cpu"}
+         )
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=500,
+             chunk_overlap=50
+         )
+         self.vector_store = None
+
+     def process_and_store_content(self, content: str, metadata: Dict[str, Any] = None) -> None:
+         """Process content and store in vector store"""
+         try:
+             # Split content into chunks
+             texts = self.text_splitter.split_text(content)
+
+             # Create metadata for each chunk
+             metadatas = [metadata or {}] * len(texts)
+
+             # Initialize or update vector store
+             if self.vector_store is None:
+                 self.vector_store = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas)
+             else:
+                 self.vector_store.add_texts(texts, metadatas=metadatas)
+
+         except Exception as e:
+             logger.error(f"Error processing content: {str(e)}")
+             raise
+
+     async def search_and_process(self, query: str, max_results: int = 5, similarity_k: int = 3) -> Dict:
+         """Search the web and process results with RAG"""
+         try:
+             # Get web search results
+             web_results = self.web_search.search(query, max_results)
+
+             # Process and store new content
+             for result in web_results['results']:
+                 if 'content' in result:
+                     self.process_and_store_content(
+                         result['content'],
+                         metadata={'url': result.get('url'), 'title': result.get('title')}
+                     )
+
+             # Perform similarity search
+             if self.vector_store:
+                 similar_docs = self.vector_store.similarity_search_with_score(
+                     query,
+                     k=similarity_k
+                 )
+
+                 # Add similarity results
+                 web_results['similar_chunks'] = [
+                     {
+                         'content': doc[0].page_content,
+                         'metadata': doc[0].metadata,
+                         'similarity_score': doc[1]
+                     }
+                     for doc in similar_docs
+                 ]
+
+             return web_results
+
+         except Exception as e:
+             logger.error(f"Error in search_and_process: {str(e)}")
+             raise
+
+     def get_relevant_context(self, query: str, k: int = 3) -> List[Dict]:
+         """Get most relevant context from vector store"""
+         if not self.vector_store:
+             return []
+
+         similar_docs = self.vector_store.similarity_search_with_score(query, k=k)
+         return [
+             {
+                 'content': doc[0].page_content,
+                 'metadata': doc[0].metadata,
+                 'similarity_score': doc[1]
+             }
+             for doc in similar_docs
+         ]
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ gradio>=4.14.0
+ requests>=2.31.0
+ beautifulsoup4>=4.12.2
+ transformers>=4.36.0
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch>=2.2.0
+ duckduckgo-search>=4.4.3
+ langchain>=0.1.0
+ sentence-transformers>=2.5.1
+ faiss-cpu>=1.7.4
+ numpy>=1.26.0
+ tqdm>=4.66.0
+ lxml>=5.1.0
+ protobuf>=4.25.2
+ accelerate>=0.26.1
search_engine.py ADDED
@@ -0,0 +1,175 @@
+ from typing import Dict, List, Any
+ import requests
+ from bs4 import BeautifulSoup
+ from duckduckgo_search import DDGS
+ from transformers import pipeline
+ from langchain.embeddings import HuggingFaceEmbeddings
+ import time
+ import json
+ import os
+ from urllib.parse import urlparse
+
+ class ModelManager:
+     """Manages different AI models for specific tasks"""
+
+     def __init__(self):
+         self.device = "cpu"
+         self.models = {}
+         self.load_models()
+
+     def load_models(self):
+         # Use smaller models for CPU deployment
+         self.models['summarizer'] = pipeline(
+             "summarization",
+             model="facebook/bart-base",
+             device=self.device
+         )
+
+         self.models['embeddings'] = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2",
+             model_kwargs={"device": self.device}
+         )
+
+ class ContentProcessor:
+     """Processes and analyzes different types of content"""
+
+     def __init__(self):
+         self.model_manager = ModelManager()
+
+     def process_content(self, content: str) -> Dict:
+         """Process content and generate insights"""
+         try:
+             # Generate summary
+             summary = self.model_manager.models['summarizer'](
+                 content[:1024],
+                 max_length=100,
+                 min_length=30,
+                 do_sample=False
+             )[0]['summary_text']
+
+             return {
+                 'summary': summary,
+                 'content_type': 'text',
+                 'explanation': summary
+             }
+         except Exception as e:
+             print(f"Error processing content: {str(e)}")
+             return {
+                 'summary': content[:200] + "...",
+                 'content_type': 'text',
+                 'explanation': "Unable to generate detailed analysis."
+             }
+
+ class WebSearchEngine:
+     """Main search engine class"""
+
+     def __init__(self):
+         self.processor = ContentProcessor()
+         self.session = requests.Session()
+         self.request_delay = 1.0
+         self.last_request_time = 0
+
+     def is_valid_url(self, url: str) -> bool:
+         """Check if URL is valid for crawling"""
+         try:
+             parsed = urlparse(url)
+             return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
+         except:
+             return False
+
+     def get_metadata(self, soup: BeautifulSoup) -> Dict:
+         """Extract metadata from page"""
+         title = soup.title.string if soup.title else ""
+         description = ""
+         if soup.find("meta", attrs={"name": "description"}):
+             description = soup.find("meta", attrs={"name": "description"}).get("content", "")
+
+         return {
+             "title": title,
+             "description": description
+         }
+
+     def process_url(self, url: str) -> Dict:
+         """Process a single URL"""
+         try:
+             # Respect rate limiting
+             current_time = time.time()
+             if current_time - self.last_request_time < self.request_delay:
+                 time.sleep(self.request_delay - (current_time - self.last_request_time))
+
+             response = self.session.get(url, timeout=10)
+             self.last_request_time = time.time()
+
+             if not response.ok:
+                 return None
+
+             soup = BeautifulSoup(response.text, 'lxml')
+             metadata = self.get_metadata(soup)
+
+             # Extract main content
+             content = ' '.join([p.get_text() for p in soup.find_all('p')])
+
+             if not content:
+                 return None
+
+             processed_content = self.processor.process_content(content)
+             processed_content['metadata'] = metadata
+
+             return {
+                 'url': url,
+                 'title': metadata['title'],
+                 'snippet': content[:200] + "...",
+                 'content': content,  # full page text so the RAG engine can chunk and index it
+                 'processed_content': processed_content
+             }
+
+         except Exception as e:
+             print(f"Error processing {url}: {str(e)}")
+             return None
+
+     def search(self, query: str, max_results: int = 5) -> Dict:
+         """Perform search and process results"""
+         try:
+             # Search using DuckDuckGo (DDGS API)
+             search_results = list(DDGS().text(query, max_results=max_results))
+
+             # Process results
+             processed_results = []
+             for result in search_results:
+                 if self.is_valid_url(result['href']):
+                     processed = self.process_url(result['href'])
+                     if processed:
+                         processed_results.append(processed)
+
+             # Generate insights
+             all_content = ' '.join([r['processed_content']['summary'] for r in processed_results if r])
+             insights = self.processor.process_content(all_content)['summary']
+
+             # Generate follow-up questions
+             follow_up_questions = [
+                 f"What are the key differences between {query} and related topics?",
+                 f"How has {query} evolved over time?",
+                 f"What are the practical applications of {query}?"
+             ]
+
+             return {
+                 'results': processed_results,
+                 'insights': insights,
+                 'follow_up_questions': follow_up_questions,
+                 'similar_queries': []
+             }
+
+         except Exception as e:
+             print(f"Error during search: {str(e)}")
+             return {
+                 'results': [],
+                 'insights': f"Error performing search: {str(e)}",
+                 'follow_up_questions': [],
+                 'similar_queries': []
+             }
+
+ # Main search function
+ def search(query: str, max_results: int = 5) -> Dict:
+     """Main search function"""
+     engine = WebSearchEngine()
+     return engine.search(query, max_results)
space.yml ADDED
@@ -0,0 +1,11 @@
+ title: Intelligent Search Engine
+ emoji: 🔍
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.14.0
+ python_version: "3.10"
+ app_file: app.py
+ app_port: 7860
+ pinned: false
+ license: apache-2.0