import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Initialize MarkItDown
md_converter = MarkItDown()


def validate_url(url: str) -> Tuple[bool, str]:
    """Validate the URL format and normalize the scheme."""
    if not url:
        return False, "URL is required"
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url


def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={
                "CLOSESPIDER_TIMEOUT": 30,
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
                "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
                "DOWNLOAD_TIMEOUT": 10,
            },
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False


def clean_text(text: str) -> str:
    """Clean and format text by removing extra whitespace and normalizing spacing."""
    if not text:
        return ""
    # Remove extra whitespace and newlines
    text = re.sub(r"[\n\s]+", " ", text)
    # Split camelCase words
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    # Clean extra spaces
    text = " ".join(text.split())
    return text.strip()


def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
    """Process a single link-text pair and return markdown if valid."""
    if not url or not text:
        return None
    url = url.strip()
    text = clean_text(text)
    if not text or not url or url in seen_links:
        return None
    seen_links.add(url)
    return f"## {text}\n[{text}]({url})"


def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types with deduplication."""
    try:
        all_links = []
        seen_links = set()  # Track unique URLs

        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[["link", "text"]].dropna().values:
                if md_link := process_link_pair(link, text, seen_links):
                    all_links.append(md_link)
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
                    texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]
                    if urls and texts:
                        urls = urls.split("@@")
                        texts = texts.split("@@")
                        for url, text in zip(urls, texts):
                            if md_link := process_link_pair(url, text, seen_links):
                                all_links.append(md_link)

        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""


def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result
    url = result

    output_file = f"crawl_{token_hex(6)}.jsonl"
    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"

        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"

        # Extract and clean title and description
        title = (
            clean_text(crawl_df["title"].iloc[0])
            if "title" in crawl_df.columns
            else "Untitled"
        )
        meta_desc = (
            clean_text(crawl_df["meta_desc"].iloc[0])
            if "meta_desc" in crawl_df.columns
            else ""
        )

        # Process links
        links_content = process_links(crawl_df, link_types)

        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content

        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)


def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"

    supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
    file_ext = os.path.splitext(file.name)[1].lower()

    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"

    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"


# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Create interface
with gr.Blocks(
    theme=theme,  # use the custom theme defined above
    css=css,
) as iface:
    gr.Markdown("# LLMs.txt Generator")

    with gr.Tab("Website URL"):
        url_input = gr.Textbox(label="Website URL", placeholder="example.com")
        link_types = gr.Dropdown(
links", "