# Spaces: Running
# app.py
import html

import faiss
import gradio as gr
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Models are loaded once at import time and shared by every handler.
# A distilled BART checkpoint is used for summarization to keep resource usage low.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Mutable application state shared by the functions below.
bookmarks = []      # bookmark dicts currently loaded in the app
faiss_index = None  # FAISS index over the bookmark summaries; None until built
fetch_cache = {}    # URL -> fetch result fields recorded by fetch_url_info()
def parse_bookmarks(file_content):
    """Extract bookmark links from a bookmarks HTML document.

    Args:
        file_content: The bookmarks file as HTML text.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts in document order, one
        for every ``<a>`` tag that has both a non-empty ``href`` and
        non-empty text. Anchors missing either one are skipped.
    """
    anchors = BeautifulSoup(file_content, 'html.parser').find_all('a')
    candidates = ((anchor.get('href'), anchor.text) for anchor in anchors)
    return [
        {'url': href, 'title': text}
        for href, text in candidates
        if href and text
    ]
def fetch_url_info(bookmark):
    """Fetch a bookmark's URL and record the result on the bookmark dict.

    The fields ``etag``, ``status_code``, ``dead_link``, ``meta_tags`` and
    ``content`` are always written together, so a bookmark whose URL was
    changed never keeps metadata that belonged to its previous URL.
    Results are remembered per URL in the module-level ``fetch_cache`` and
    reused on later calls for the same URL.

    Args:
        bookmark: Dict with at least a ``'url'`` key. It is updated in place.

    Returns:
        The same ``bookmark`` dict, for convenience.
    """
    url = bookmark['url']
    if url in fetch_cache:
        bookmark.update(fetch_cache[url])
        return bookmark

    try:
        response = requests.get(url, timeout=5)
        reachable = response.status_code < 400
        meta_tags = {}
        content = ''
        if reachable:
            soup = BeautifulSoup(response.content, 'html.parser')
            meta_tags = {
                meta.get('name', ''): meta.get('content', '')
                for meta in soup.find_all('meta')
            }
            content = soup.get_text(separator=' ', strip=True)
        info = {
            'etag': response.headers.get('ETag', 'N/A'),
            'status_code': response.status_code,
            'dead_link': not reachable,
            # Error responses get empty metadata instead of inheriting
            # whatever an earlier fetch left on the bookmark.
            'meta_tags': meta_tags,
            'content': content,
        }
    except Exception:
        # Deliberately best-effort: any failure (network error, invalid URL,
        # parsing problem) marks the link as dead rather than aborting the
        # processing of the remaining bookmarks.
        info = {
            'etag': 'N/A',
            'status_code': 'N/A',
            'dead_link': True,
            'meta_tags': {},
            'content': '',
        }

    fetch_cache[url] = info
    bookmark.update(info)
    return bookmark
def generate_summary(bookmark):
    """Store a short summary of the bookmark's page text under ``'summary'``.

    Args:
        bookmark: Bookmark dict; its ``'content'`` value (if any) is the text
            to summarize. The dict is updated in place.

    Returns:
        The same ``bookmark`` dict. When there is no content, a fixed
        placeholder message is stored as the summary instead.
    """
    text = bookmark.get('content', '')
    if not text:
        bookmark['summary'] = 'No content available to summarize.'
        return bookmark

    # Only the first 500 characters are summarized to save resources.
    result = summarizer(text[:500], max_length=50, min_length=25, do_sample=False)
    bookmark['summary'] = result[0]['summary_text']
    return bookmark
def vectorize_and_index(bookmarks):
    """Embed every bookmark summary and load the vectors into a FAISS index.

    Args:
        bookmarks: List of bookmark dicts, each with a ``'summary'`` key.
            Callers in this file only pass a non-empty list.

    Returns:
        ``(index, embeddings)``: a ``faiss.IndexFlatL2`` holding one vector
        per bookmark, added in list order, and the embeddings that were
        added to it.
    """
    texts = [entry['summary'] for entry in bookmarks]
    embeddings = embedding_model.encode(texts)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    return index, embeddings
def display_bookmarks():
    """Build the table of currently loaded bookmarks.

    The callers in this file render the result with
    ``DataFrame.to_html(escape=False)`` so that the URL column can contain a
    link. Titles, URLs, ETags and summaries originate from the uploaded file
    and from fetched pages, so every text value is HTML-escaped here; the
    anchor tag in the URL column is the only markup this function emits.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Index``, ``Title``, ``URL``,
        ``Status``, ``ETag``, ``Summary`` and ``css_class``, one row per
        bookmark. The frame is empty when no bookmarks are loaded.
    """
    rows = []
    for i, bookmark in enumerate(bookmarks):
        is_dead = bool(bookmark.get('dead_link'))
        raw_url = str(bookmark['url'])
        safe_url = html.escape(raw_url, quote=True)
        if raw_url.strip().lower().startswith(('http://', 'https://')):
            url_cell = (
                f"<a href='{safe_url}' target='_blank' "
                f"rel='noopener noreferrer'>{safe_url}</a>"
            )
        else:
            # Other schemes (e.g. javascript:) are shown as plain text so
            # they cannot be triggered by a click.
            url_cell = safe_url
        rows.append({
            'Index': i,
            'Title': html.escape(str(bookmark['title'])),
            'URL': url_cell,
            'Status': "Dead Link" if is_dead else "Active",
            'ETag': html.escape(str(bookmark.get('etag', 'N/A'))),
            'Summary': html.escape(str(bookmark.get('summary', ''))),
            'css_class': "dead-link" if is_dead else "",
        })
    return pd.DataFrame(rows)
def process_uploaded_file(file):
    """Parse an uploaded bookmarks file, enrich each bookmark and index them.

    Replaces the module-level ``bookmarks`` list with the parsed bookmarks
    and, once they are fetched and summarized, rebuilds ``faiss_index``.

    Args:
        file: Raw bytes of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        ``(message, table)``: a status message and the bookmark table as a
        DataFrame. The DataFrame is empty whenever processing stops early
        (no file, undecodable file, or no bookmarks found).
    """
    global bookmarks, faiss_index

    if file is None:
        return "Please upload a bookmarks HTML file.", pd.DataFrame()

    try:
        # The upload arrives as bytes; the parser is given text.
        html_text = file.decode('utf-8')
    except UnicodeDecodeError:
        return "Error decoding the file. Please ensure it's a valid HTML file.", pd.DataFrame()

    bookmarks = parse_bookmarks(html_text)
    if not bookmarks:
        return "No bookmarks found in the uploaded file.", pd.DataFrame()

    for entry in bookmarks:
        fetch_url_info(entry)
        generate_summary(entry)

    faiss_index, _ = vectorize_and_index(bookmarks)
    return f"Successfully processed {len(bookmarks)} bookmarks.", display_bookmarks()
def chatbot_response(user_query):
    """Answer a query with the bookmarks whose summaries are closest to it.

    Args:
        user_query: Free-text question typed by the user.

    Returns:
        A text block listing title, URL and summary of up to five matching
        bookmarks, separated by blank lines, or a hint to upload bookmarks
        first when nothing has been indexed yet.
    """
    if faiss_index is None or not bookmarks:
        return "No bookmarks available. Please upload and process your bookmarks first."

    query_embedding = embedding_model.encode([user_query])

    # FAISS fills missing result slots with -1 when the index holds fewer
    # than k vectors. Never ask for more hits than there are bookmarks, and
    # reject negative ids below, because bookmarks[-1] would silently repeat
    # the last bookmark.
    top_k = min(5, len(bookmarks))
    _, indices = faiss_index.search(np.array(query_embedding), k=top_k)

    entries = []
    for idx in indices[0]:
        if 0 <= idx < len(bookmarks):
            bookmark = bookmarks[idx]
            entries.append(
                f"Title: {bookmark['title']}\n"
                f"URL: {bookmark['url']}\n"
                f"Summary: {bookmark['summary']}"
            )
    return "\n\n".join(entries).strip()
def edit_bookmark(bookmark_idx, new_title, new_url):
    """Change a bookmark's title and URL, then refresh its data and the index.

    Args:
        bookmark_idx: Position of the bookmark in the list; converted with
            ``int()``.
        new_title: Replacement title.
        new_url: Replacement URL; it is fetched and summarized again.

    Returns:
        ``(message, table)``: a status message and the current bookmark table
        as a DataFrame. Any error is reported in the message instead of
        being raised.
    """
    global faiss_index
    try:
        position = int(bookmark_idx)
        if not 0 <= position < len(bookmarks):
            return "Invalid bookmark index.", display_bookmarks()

        target = bookmarks[position]
        target['title'] = new_title
        target['url'] = new_url
        fetch_url_info(target)
        generate_summary(target)

        # The summary may have changed, so the whole index is rebuilt.
        faiss_index, _ = vectorize_and_index(bookmarks)
        return "Bookmark updated successfully.", display_bookmarks()
    except Exception as exc:
        return f"Error: {exc}", display_bookmarks()
def delete_bookmark(bookmark_idx):
    """Remove a bookmark by position and rebuild (or clear) the search index.

    Args:
        bookmark_idx: Position of the bookmark in the list; converted with
            ``int()``.

    Returns:
        ``(message, table)``: a status message and the current bookmark table
        as a DataFrame. Any error is reported in the message instead of
        being raised.
    """
    global faiss_index
    try:
        position = int(bookmark_idx)
        if not 0 <= position < len(bookmarks):
            return "Invalid bookmark index.", display_bookmarks()

        del bookmarks[position]

        # Rebuild the index over the remaining bookmarks; with none left
        # there is nothing to index.
        faiss_index = vectorize_and_index(bookmarks)[0] if bookmarks else None
        return "Bookmark deleted successfully.", display_bookmarks()
    except Exception as exc:
        return f"Error: {exc}", display_bookmarks()
def build_app():
    """Assemble the Gradio UI (upload, chat and manage tabs) and launch it.

    The bookmark handlers return DataFrames, while both bookmark tables are
    ``gr.HTML`` components. Every handler wired to an HTML output is
    therefore wrapped so that the component always receives an HTML string.
    """

    def render_table(df):
        # escape=False keeps the anchor markup in the URL column clickable.
        return df.to_html(escape=False, index=False)

    def update_bookmark_table(file):
        message, df = process_uploaded_file(file)
        return message, render_table(df)

    def update_manage_table():
        return render_table(display_bookmarks())

    def edit_and_render(bookmark_idx, new_title, new_url):
        message, df = edit_bookmark(bookmark_idx, new_title, new_url)
        return message, render_table(df)

    def delete_and_render(bookmark_idx):
        message, df = delete_bookmark(bookmark_idx)
        return message, render_table(df)

    with gr.Blocks(css="app.css") as demo:
        gr.Markdown("<h1 style='text-align: center;'>Bookmark Manager App</h1>")

        with gr.Tab("Upload and Process Bookmarks"):
            upload = gr.File(label="Upload Bookmarks HTML File", type='binary')
            process_button = gr.Button("Process Bookmarks")
            output_text = gr.Textbox(label="Output")
            bookmark_table = gr.HTML(label="Bookmarks")
            process_button.click(
                update_bookmark_table,
                inputs=upload,
                outputs=[output_text, bookmark_table],
            )

        with gr.Tab("Chat with Bookmarks"):
            user_input = gr.Textbox(label="Ask about your bookmarks")
            chat_output = gr.Textbox(label="Chatbot Response")
            chat_button = gr.Button("Send")
            chat_button.click(
                chatbot_response,
                inputs=user_input,
                outputs=chat_output,
            )

        with gr.Tab("Manage Bookmarks"):
            manage_output = gr.Textbox(label="Manage Output")
            # Initial content of the bookmarks table.
            bookmark_table_manage = gr.HTML(
                value=update_manage_table(),
                label="Bookmarks",
            )
            refresh_button = gr.Button("Refresh Bookmark List")
            with gr.Row():
                index_input = gr.Number(label="Bookmark Index")
                new_title_input = gr.Textbox(label="New Title")
                new_url_input = gr.Textbox(label="New URL")
            edit_button = gr.Button("Edit Bookmark")
            delete_button = gr.Button("Delete Bookmark")

            refresh_button.click(
                update_manage_table,
                inputs=None,
                outputs=bookmark_table_manage,
            )
            edit_button.click(
                edit_and_render,
                inputs=[index_input, new_title_input, new_url_input],
                outputs=[manage_output, bookmark_table_manage],
            )
            delete_button.click(
                delete_and_render,
                inputs=index_input,
                outputs=[manage_output, bookmark_table_manage],
            )

    demo.launch()
# Build and launch the UI only when this file is run as a script.
if __name__ == "__main__":
    build_app()