# -*- coding: utf-8 -*-
import re
import json
from itertools import chain

import numpy as np
from gensim.models import Word2Vec
from tqdm import tqdm
import faiss
import gradio as gr
from huggingface_hub import hf_hub_download

# --- CONFIGURATION ---
target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
chunk_size = 5  # number of consecutive comments merged into one searchable chunk


# --- LOAD REDDIT COMMENTS ---
def load_reddit_split(subreddit_name):
    """Download one subreddit's JSONL file from the Hub and yield its comments."""
    path = hf_hub_download(
        repo_id="HuggingFaceGECLM/REDDIT_comments",
        repo_type="dataset",
        filename=f"{subreddit_name}.jsonl",
    )
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)


combined_dataset = list(chain(*(load_reddit_split(sub) for sub in target_subreddits)))


# --- CLEAN + CHUNK ---
def clean_text(text):
    """Lowercase, strip URLs and non-alphabetic characters, collapse whitespace."""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()


# Keep each cleaned comment and its subreddit label side by side so they stay aligned.
cleaned_comments = []
comment_labels = []
for comment in combined_dataset:
    if "body" in comment:
        cleaned_comments.append(clean_text(comment["body"]))
        comment_labels.append(comment["subreddit_name_prefixed"])

# Merge consecutive comments into chunks; label each chunk by its first comment's subreddit.
chunked_comments = [
    " ".join(cleaned_comments[i:i + chunk_size])
    for i in range(0, len(cleaned_comments), chunk_size)
]
subreddit_labels = [comment_labels[i] for i in range(0, len(comment_labels), chunk_size)]

# --- TOKENIZE ---
tokenized_chunks = [chunk.split() for chunk in chunked_comments]

# --- TRAIN WORD2VEC ---
model = Word2Vec(
    sentences=tokenized_chunks,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,  # skip-gram
)
model.save("reddit_word2vec.model")


# --- EMBEDDINGS + FAISS ---
def embed_tokens(tokens, model):
    """Average the Word2Vec vectors of the in-vocabulary tokens."""
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)


embeddings = np.array(
    [embed_tokens(chunk, model) for chunk in tqdm(tokenized_chunks, desc="Embedding chunks")]
).astype("float32")

index = faiss.IndexFlatL2(model.vector_size)
index.add(embeddings)
faiss.write_index(index, "reddit_faiss.index")

# --- SEARCH LOGIC ---
model = Word2Vec.load("reddit_word2vec.model")
index = faiss.read_index("reddit_faiss.index")

subreddit_map = {i: label for i, label in enumerate(subreddit_labels)}
unique_subreddits = sorted(set(subreddit_labels))


def embed_text(text):
    tokens = clean_text(text).split()
    return embed_tokens(tokens, model).astype("float32")


def search_reddit(query, selected_subreddit, top_k=5):
    query_vec = embed_text(query).reshape(1, -1)
    # Retrieve a larger candidate pool so the post-hoc subreddit filter
    # can still return up to top_k matches.
    D, I = index.search(query_vec, top_k * 10)
    results = []
    for idx in I[0]:
        if 0 <= idx < len(chunked_comments) and subreddit_map.get(idx) == selected_subreddit:
            results.append(f"🔸 {chunked_comments[idx]}")
            if len(results) >= top_k:
                break
    if not results:
        return "⚠️ No relevant results found."
    return "\n\n".join(results)


# --- GRADIO APP ---
with gr.Blocks(theme=gr.themes.Base(primary_hue="orange", secondary_hue="gray")) as demo:
    gr.Image(
        value="https://1000logos.net/wp-content/uploads/2017/05/Reddit-Logo.png",
        show_label=False,
        height=100,
    )
    gr.Markdown(
        "## 👾 Reddit Semantic Search (Powered by Word2Vec + FAISS)\n"
        "_Disclaimer: Experimental prototype, not owned/developed by Reddit Inc_"
    )
    with gr.Row():
        query = gr.Textbox(label="Enter your Reddit-like query", placeholder="e.g. What's new in AI?")
        subreddit_dropdown = gr.Dropdown(choices=unique_subreddits, label="Filter by Subreddit")
    output = gr.Textbox(label="Top Matching Chunks", lines=10)
    search_btn = gr.Button("🔍 Search")
    search_btn.click(fn=search_reddit, inputs=[query, subreddit_dropdown], outputs=output)

demo.launch(share=True)
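
# A minimal sanity check for the retrieval path (a sketch, assuming the index was
# built in this session and that subreddit labels look like "r/technology"; the
# query string is purely illustrative). Run it before demo.launch(), or in a
# separate session after loading the saved model and index:
#
#     print(search_reddit("latest advances in machine learning", "r/technology", top_k=3))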