GenAIDevTOProd committed on
Commit c75ec66 · verified · 1 Parent(s): 7b8fa4f

Update app.py

Files changed (1)
  1. app.py +12 -23
app.py CHANGED
@@ -15,30 +15,24 @@ from huggingface_hub import HfApi
 # Load token from Hugging Face Secrets
 HF_TOKEN = os.environ.get("RedditSemanticSearch")
 
-# Function to stream JSONL Reddit files from HF Hub
 from datasets import load_dataset
+from itertools import islice
 
 # Define target subreddits
 target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
 
-# Load full Reddit dataset (assumes it's pre-split by subreddit or has a field)
-dataset_splits = [load_dataset("HuggingFaceGECLM/REDDIT_comments", split=sub, streaming=True) for sub in target_subreddits]
-
-# Filter only relevant subreddits
-dataset = dataset.filter(lambda x: x["subreddit"] in target_subreddits)
-
-# Take a sample (to limit memory for now)
-comments = [{"body": ex["body"]} for ex in dataset.select(range(100000))]
+# Load streaming dataset for each subreddit and combine
+def stream_subreddit_data(subreddit):
+    return load_dataset("HuggingFaceGECLM/REDDIT_comments", split=subreddit, streaming=True)
 
-import pandas as pd
-import re
-from itertools import islice
+# Combine streams
+combined_dataset = chain(*(stream_subreddit_data(sub) for sub in target_subreddits))
 
-# Load a sample of the dataset (e.g., 100,000 records for performance)
-comments = [{"body": ex["body"]} for ex in islice(combined_dataset, 100000)]
+# Sample up to 100,000 comments from the combined stream
+comments = list(islice(combined_dataset, 100000))
 
-# Convert to DataFrame
-df = pd.DataFrame(comments)
+# Extract text and subreddit
+df = pd.DataFrame([{"body": ex["body"], "subreddit": ex["subreddit"]} for ex in comments])
 
 # Clean text function
 def clean_body(text):
@@ -59,13 +53,8 @@ df_chunked.rename(columns={"clean": "chunk_text"}, inplace=True)
 # Final list for embedding
 chunked_comments = df_chunked["chunk_text"].tolist()
 
-# Create subreddit labels
-combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
-subreddit_labels = []
-for example in combined_dataset:
-    subreddit_labels.append(example["subreddit_name_prefixed"])
-    if len(subreddit_labels) >= len(chunked_comments):
-        break
+# Create subreddit labels (reused from original list)
+subreddit_labels = df["subreddit"].tolist()[:len(chunked_comments)]
 
 # Tokenize
 def clean_text(text):
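
For reference, below is a minimal, self-contained sketch of the streaming flow the new code follows. It is an illustration under assumptions, not a copy of app.py: this hunk imports only islice but calls chain and pd, so chain and pandas are presumably imported near the top of the file (outside the diff), and the sketch imports them explicitly. The dataset name, split names, and the body/subreddit fields are taken from the diff.

from itertools import chain, islice

import pandas as pd
from datasets import load_dataset

target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]

def stream_subreddit_data(subreddit):
    # Each target subreddit is loaded as its own streaming split, so nothing
    # is fully downloaded up front.
    return load_dataset("HuggingFaceGECLM/REDDIT_comments", split=subreddit, streaming=True)

# Chain the per-subreddit streams into one iterator and take a bounded sample.
combined_dataset = chain(*(stream_subreddit_data(sub) for sub in target_subreddits))
comments = list(islice(combined_dataset, 100000))

# Keep body and subreddit together so the label travels with the text it describes.
df = pd.DataFrame([{"body": ex["body"], "subreddit": ex["subreddit"]} for ex in comments])
print(df["subreddit"].value_counts())

Two caveats worth noting. Because chain consumes the streams in order, a 100,000-item sample will come mostly from the first subreddits in the list unless the streams are interleaved (for example with datasets.interleave_datasets). And the label step in the second hunk, subreddit_labels = df["subreddit"].tolist()[:len(chunked_comments)], stays aligned with chunked_comments only if cleaning and chunking keep exactly one row per original comment; if a comment can split into several chunks, carrying the subreddit column through df_chunked would keep the labels aligned.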