fikird committed
Commit 44198e0 (0 parents)

Add RAG functionality with vector storage and web crawling

Files changed (8):
  1. .gitignore +41 -0
  2. README.md +80 -0
  3. app.py +155 -0
  4. packages.txt +4 -0
  5. rag_engine.py +93 -0
  6. requirements.txt +15 -0
  7. search_engine.py +175 -0
  8. space.yml +11 -0
.gitignore ADDED
@@ -0,0 +1,41 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Project specific
+ *.log
+ cache/
+ .env
README.md ADDED
@@ -0,0 +1,80 @@
+ # 🔍 Intelligent Web Search Engine
+
+ An advanced AI-powered search engine that provides deep understanding of web content, code analysis, and intelligent insights.
+
+ ## 🌟 Features
+
+ - Multi-model AI analysis
+ - Semantic search and caching
+ - Automatic insights generation
+ - Smart follow-up questions
+ - Code-aware analysis
+ - Related searches
+
+ ## 🚀 Deployment to Hugging Face Spaces
+
+ 1. Create a new Space:
+    - Go to [huggingface.co/spaces](https://huggingface.co/spaces)
+    - Click "Create new Space"
+    - Choose "Gradio" as the SDK
+    - Select "CPU" as the hardware
+    - Name your space (e.g., "intelligent-web-search")
+
+ 2. Upload Files:
+    - Upload all files from the `aiws` directory (including the new `rag_engine.py`)
+    - Make sure to include:
+      - `app.py`
+      - `search_engine.py`
+      - `requirements.txt`
+      - `packages.txt`
+
+ 3. Space Settings:
+    - Go to the "Settings" tab
+    - Under "Repository secrets", add any required API keys
+    - Under "Variables", set:
+      ```
+      PYTHON_PACKAGES_PATH=/home/user/.local/lib/python3.9/site-packages
+      ```
+
+ 4. The space will automatically build and deploy your app
+
+ ## 📦 Local Development
+
+ 1. Clone the repository:
+    ```bash
+    git clone [your-repo-url]
+    cd aiws
+    ```
+
+ 2. Install dependencies:
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. Run the app:
+    ```bash
+    python app.py
+    ```
+
+ ## 🔧 Configuration
+
+ The search engine uses several AI models:
+ - Summarization: facebook/bart-base
+ - Code Understanding: Salesforce/codet5-small
+ - General QA: google/flan-t5-base
+ - Embeddings: sentence-transformers/all-MiniLM-L6-v2
+
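For reference, the summarization and embedding models are initialized on CPU in `search_engine.py` and `rag_engine.py` roughly as follows (a minimal sketch, not the full pipeline):

```python
from transformers import pipeline
from langchain.embeddings import HuggingFaceEmbeddings

# Summarization model used by ContentProcessor (CPU-only)
summarizer = pipeline("summarization", model="facebook/bart-base", device="cpu")

# Embedding model shared by the search and RAG engines
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)
```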
+ ## 📝 Usage
+
+ 1. Enter your search query
+ 2. Adjust the maximum number of results (3-10)
+ 3. Click "Search"
+ 4. View results including:
+    - Key insights
+    - Follow-up questions
+    - Detailed analysis
+    - Related searches
+
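The same flow can also be driven from Python instead of the UI, mirroring what `app.py` does internally (a minimal sketch; the query string is just an example):

```python
import asyncio
from rag_engine import RAGEngine

async def main():
    rag = RAGEngine()
    # Search the web, index page content, and attach similar chunks
    results = await rag.search_and_process("transformer architecture", max_results=5)
    print(results["insights"])
    for item in results["results"]:
        print(item["title"], "->", item["url"])

if __name__ == "__main__":
    asyncio.run(main())
```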
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
app.py ADDED
@@ -0,0 +1,155 @@
+ import gradio as gr
+ from rag_engine import RAGEngine
+ import torch
+ import os
+ import logging
+ import traceback
+ import asyncio
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ def safe_search(query, max_results):
+     """Wrapper function to handle errors gracefully"""
+     try:
+         rag = RAGEngine()
+         results = asyncio.run(rag.search_and_process(query, max_results))
+         return format_results(results)
+     except Exception as e:
+         error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+         logger.error(error_msg)
+         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
+
+ def format_results(results):
+     """Format search results for display"""
+     if not results:
+         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+     formatted = f"# 🔍 Search Results\n\n"
+
+     # Add insights section
+     if 'insights' in results:
+         formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+     # Add follow-up questions
+     if 'follow_up_questions' in results:
+         formatted += "## ❓ Follow-up Questions\n"
+         for q in results['follow_up_questions']:
+             if q and q.strip():
+                 formatted += f"- {q.strip()}\n"
+         formatted += "\n"
+
+     # Add main results
+     if 'results' in results:
+         formatted += "## 📄 Detailed Results\n\n"
+         for i, result in enumerate(results['results'], 1):
+             formatted += f"### {i}. "
+             if 'url' in result:
+                 formatted += f"[{result.get('title', 'Untitled')}]({result['url']})\n"
+             else:
+                 formatted += f"{result.get('title', 'Untitled')}\n"
+
+             if result.get('processed_content'):
+                 content = result['processed_content']
+                 if 'summary' in content:
+                     formatted += f"**Summary:** {content['summary']}\n\n"
+                 if content.get('metadata', {}).get('description'):
+                     formatted += f"**Description:** {content['metadata']['description']}\n\n"
+                 if content.get('content_type') == 'code':
+                     formatted += f"**Code Analysis:** {content.get('explanation', '')}\n\n"
+                 else:
+                     formatted += f"**Detailed Explanation:** {content.get('explanation', '')}\n\n"
+
+             if 'snippet' in result:
+                 formatted += f"**Snippet:** {result['snippet']}\n\n"
+             formatted += "---\n\n"
+
+     # Add similar queries if available
+     if results.get('similar_queries'):
+         formatted += "## 🔄 Related Searches\n"
+         for query in results['similar_queries']:
+             if isinstance(query, dict) and 'query' in query:
+                 formatted += f"- {query['query']}\n"
+             elif isinstance(query, str):
+                 formatted += f"- {query}\n"
+
+     return formatted
+
+ def create_demo():
+     """Create the Gradio interface"""
+
+     # Create cache directory
+     os.makedirs(".cache", exist_ok=True)
+
+     demo = gr.Blocks(
+         title="AI-Powered Search Engine",
+         css="""
+         .gradio-container {max-width: 1200px !important}
+         .markdown-text {font-size: 16px !important}
+         """
+     )
+
+     with demo:
+         gr.Markdown("""
+         # 🔍 Intelligent Web Search Engine
+
+         This advanced search engine uses AI to provide deep understanding of search results:
+         - 🧠 Multi-model AI analysis
+         - 📊 Semantic search and caching
+         - 💡 Automatic insights generation
+         - ❓ Smart follow-up questions
+         - 🔄 Related searches
+         """)
+
+         with gr.Row():
+             with gr.Column():
+                 query = gr.Textbox(
+                     label="Search Query",
+                     placeholder="Enter your search query...",
+                     lines=2
+                 )
+                 max_results = gr.Slider(
+                     minimum=3,
+                     maximum=10,
+                     value=5,
+                     step=1,
+                     label="Maximum Results"
+                 )
+                 search_btn = gr.Button("🔍 Search", variant="primary")
+
+             with gr.Column():
+                 output = gr.Markdown(
+                     label="Results",
+                     show_label=False
+                 )
+
+         search_btn.click(
+             fn=safe_search,
+             inputs=[query, max_results],
+             outputs=output
+         )
+
+         gr.Examples(
+             examples=[
+                 ["What are the latest developments in quantum computing?", 5],
+                 ["How does Python's asyncio work? Show code examples", 5],
+                 ["Explain the transformer architecture in deep learning", 5],
+                 ["What are the environmental impacts of renewable energy?", 5]
+             ],
+             inputs=[query, max_results],
+             outputs=output,
+             fn=safe_search,
+             cache_examples=True
+         )
+
+     return demo
+
+ # Create the demo
+ demo = create_demo()
+
+ # Launch for Spaces
+ demo.launch()
packages.txt ADDED
@@ -0,0 +1,4 @@
+ python3-dev
+ build-essential
+ git
+ libgomp1
rag_engine.py ADDED
@@ -0,0 +1,93 @@
+ from typing import List, Dict, Any
+ import numpy as np
+ from langchain.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from search_engine import WebSearchEngine
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class RAGEngine:
+     def __init__(self):
+         self.web_search = WebSearchEngine()
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2",
+             model_kwargs={"device": "cpu"}
+         )
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=500,
+             chunk_overlap=50
+         )
+         self.vector_store = None
+
+     def process_and_store_content(self, content: str, metadata: Dict[str, Any] = None) -> None:
+         """Process content and store in vector store"""
+         try:
+             # Split content into chunks
+             texts = self.text_splitter.split_text(content)
+
+             # Create metadata for each chunk
+             metadatas = [metadata or {}] * len(texts)
+
+             # Initialize or update vector store
+             if self.vector_store is None:
+                 self.vector_store = FAISS.from_texts(texts, self.embeddings, metadatas=metadatas)
+             else:
+                 self.vector_store.add_texts(texts, metadatas=metadatas)
+
+         except Exception as e:
+             logger.error(f"Error processing content: {str(e)}")
+             raise
+
+     async def search_and_process(self, query: str, max_results: int = 5, similarity_k: int = 3) -> Dict:
+         """Search the web and process results with RAG"""
+         try:
+             # Get web search results
+             web_results = self.web_search.search(query, max_results)
+
+             # Process and store new content
+             for result in web_results['results']:
+                 if 'content' in result:
+                     self.process_and_store_content(
+                         result['content'],
+                         metadata={'url': result.get('url'), 'title': result.get('title')}
+                     )
+
+             # Perform similarity search
+             if self.vector_store:
+                 similar_docs = self.vector_store.similarity_search_with_score(
+                     query,
+                     k=similarity_k
+                 )
+
+                 # Add similarity results
+                 web_results['similar_chunks'] = [
+                     {
+                         'content': doc[0].page_content,
+                         'metadata': doc[0].metadata,
+                         'similarity_score': doc[1]
+                     }
+                     for doc in similar_docs
+                 ]
+
+             return web_results
+
+         except Exception as e:
+             logger.error(f"Error in search_and_process: {str(e)}")
+             raise
+
+     def get_relevant_context(self, query: str, k: int = 3) -> List[Dict]:
+         """Get most relevant context from vector store"""
+         if not self.vector_store:
+             return []
+
+         similar_docs = self.vector_store.similarity_search_with_score(query, k=k)
+         return [
+             {
+                 'content': doc[0].page_content,
+                 'metadata': doc[0].metadata,
+                 'similarity_score': doc[1]
+             }
+             for doc in similar_docs
+         ]
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ gradio>=4.14.0
+ requests>=2.31.0
+ beautifulsoup4>=4.12.2
+ transformers>=4.36.0
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch>=2.2.0
+ duckduckgo-search>=4.4.3
+ langchain>=0.1.0
+ sentence-transformers>=2.5.1
+ faiss-cpu>=1.7.4
+ numpy>=1.26.0
+ tqdm>=4.66.0
+ lxml>=5.1.0
+ protobuf>=4.25.2
+ accelerate>=0.26.1
search_engine.py ADDED
@@ -0,0 +1,175 @@
+ from typing import Dict, List, Any
+ import requests
+ from bs4 import BeautifulSoup
+ from duckduckgo_search import DDGS
+ from transformers import pipeline
+ from langchain.embeddings import HuggingFaceEmbeddings
+ import time
+ import json
+ import os
+ from urllib.parse import urlparse
+
+ class ModelManager:
+     """Manages different AI models for specific tasks"""
+
+     def __init__(self):
+         self.device = "cpu"
+         self.models = {}
+         self.load_models()
+
+     def load_models(self):
+         # Use smaller models for CPU deployment
+         self.models['summarizer'] = pipeline(
+             "summarization",
+             model="facebook/bart-base",
+             device=self.device
+         )
+
+         self.models['embeddings'] = HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2",
+             model_kwargs={"device": self.device}
+         )
+
+ class ContentProcessor:
+     """Processes and analyzes different types of content"""
+
+     def __init__(self):
+         self.model_manager = ModelManager()
+
+     def process_content(self, content: str) -> Dict:
+         """Process content and generate insights"""
+         try:
+             # Generate summary
+             summary = self.model_manager.models['summarizer'](
+                 content[:1024],
+                 max_length=100,
+                 min_length=30,
+                 do_sample=False
+             )[0]['summary_text']
+
+             return {
+                 'summary': summary,
+                 'content_type': 'text',
+                 'explanation': summary
+             }
+         except Exception as e:
+             print(f"Error processing content: {str(e)}")
+             return {
+                 'summary': content[:200] + "...",
+                 'content_type': 'text',
+                 'explanation': "Unable to generate detailed analysis."
+             }
+
+ class WebSearchEngine:
+     """Main search engine class"""
+
+     def __init__(self):
+         self.processor = ContentProcessor()
+         self.session = requests.Session()
+         self.request_delay = 1.0
+         self.last_request_time = 0
+
+     def is_valid_url(self, url: str) -> bool:
+         """Check if URL is valid for crawling"""
+         try:
+             parsed = urlparse(url)
+             return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
+         except:
+             return False
+
+     def get_metadata(self, soup: BeautifulSoup) -> Dict:
+         """Extract metadata from page"""
+         title = soup.title.string if soup.title else ""
+         description = ""
+         if soup.find("meta", attrs={"name": "description"}):
+             description = soup.find("meta", attrs={"name": "description"}).get("content", "")
+
+         return {
+             "title": title,
+             "description": description
+         }
+
+     def process_url(self, url: str) -> Dict:
+         """Process a single URL"""
+         try:
+             # Respect rate limiting
+             current_time = time.time()
+             if current_time - self.last_request_time < self.request_delay:
+                 time.sleep(self.request_delay - (current_time - self.last_request_time))
+
+             response = self.session.get(url, timeout=10)
+             self.last_request_time = time.time()
+
+             if not response.ok:
+                 return None
+
+             soup = BeautifulSoup(response.text, 'lxml')
+             metadata = self.get_metadata(soup)
+
+             # Extract main content
+             content = ' '.join([p.get_text() for p in soup.find_all('p')])
+
+             if not content:
+                 return None
+
+             processed_content = self.processor.process_content(content)
+             processed_content['metadata'] = metadata
+
+             return {
+                 'url': url,
+                 'title': metadata['title'],
+                 'snippet': content[:200] + "...",
+                 'content': content,  # full page text so the RAG engine can chunk and index it
+                 'processed_content': processed_content
+             }
+
+         except Exception as e:
+             print(f"Error processing {url}: {str(e)}")
+             return None
+
+     def search(self, query: str, max_results: int = 5) -> Dict:
+         """Perform search and process results"""
+         try:
+             # Search using DuckDuckGo (DDGS API)
+             search_results = list(DDGS().text(query, max_results=max_results))
+
+             # Process results
+             processed_results = []
+             for result in search_results:
+                 if self.is_valid_url(result['href']):
+                     processed = self.process_url(result['href'])
+                     if processed:
+                         processed_results.append(processed)
+
+             # Generate insights
+             all_content = ' '.join([r['processed_content']['summary'] for r in processed_results if r])
+             insights = self.processor.process_content(all_content)['summary']
+
+             # Generate follow-up questions
+             follow_up_questions = [
+                 f"What are the key differences between {query} and related topics?",
+                 f"How has {query} evolved over time?",
+                 f"What are the practical applications of {query}?"
+             ]
+
+             return {
+                 'results': processed_results,
+                 'insights': insights,
+                 'follow_up_questions': follow_up_questions,
+                 'similar_queries': []
+             }
+
+         except Exception as e:
+             print(f"Error during search: {str(e)}")
+             return {
+                 'results': [],
+                 'insights': f"Error performing search: {str(e)}",
+                 'follow_up_questions': [],
+                 'similar_queries': []
+             }
+
+ # Main search function
+ def search(query: str, max_results: int = 5) -> Dict:
+     """Main search function"""
+     engine = WebSearchEngine()
+     return engine.search(query, max_results)
space.yml ADDED
@@ -0,0 +1,11 @@
+ title: Intelligent Search Engine
+ emoji: 🔍
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.14.0
+ python_version: "3.10"
+ app_file: app.py
+ app_port: 7860
+ pinned: false
+ license: apache-2.0