GenAIDevTOProd committed
Commit b661309 · verified · 1 Parent(s): cc1c9e4

Update app.py

Files changed (1): app.py (+10, -11)
app.py CHANGED
@@ -19,17 +19,16 @@ HF_TOKEN = os.environ.get("RedditSemanticSearch")
 target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
 
 # Function to stream JSONL Reddit files from HF Hub
-def load_reddit_split(subreddit_name):
-    file_path = hf_hub_download(
-        repo_id="HuggingFaceGECLM/REDDIT_comments",
-        filename=f"{subreddit_name}.jsonl"
-    )
-    with open(file_path, "r") as f:
-        for line in f:
-            yield json.loads(line)
-
-# Combine subreddit data
-combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
+from datasets import load_dataset
+
+# Load full Reddit dataset (assumes it's pre-split by subreddit or has a field)
+dataset = load_dataset("HuggingFaceGECLM/REDDIT_comments", split="train")
+
+# Filter only relevant subreddits
+dataset = dataset.filter(lambda x: x["subreddit"] in target_subreddits)
+
+# Take a sample (to limit memory for now)
+comments = [{"body": ex["body"]} for ex in dataset.select(range(100000))]
 
 import pandas as pd
 import re
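
Note on the new loading path: load_dataset(..., split="train") materializes the full REDDIT_comments corpus before the filter runs, and dataset.select(range(100000)) can fail if fewer than 100,000 rows survive the filter. A lighter-weight variant would stream per-subreddit data instead, closer in spirit to the removed load_reddit_split. The sketch below assumes the Hub dataset exposes per-subreddit configurations named after the subreddits (as the removed f"{subreddit_name}.jsonl" filenames suggest, but not confirmed by this commit); stream_subreddit is a hypothetical helper, not part of this change.

from itertools import chain, islice
from datasets import load_dataset

target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]

def stream_subreddit(name):
    # Stream one per-subreddit config lazily instead of downloading everything.
    # Assumption: config names match subreddit names on the Hub.
    ds = load_dataset("HuggingFaceGECLM/REDDIT_comments", name,
                      split="train", streaming=True)
    for ex in ds:
        yield {"body": ex["body"]}

# Keep the same 100,000-comment cap as dataset.select(range(100000)),
# but without loading the full corpus into memory first.
comments = list(islice(
    chain.from_iterable(stream_subreddit(s) for s in target_subreddits),
    100000,
))

Because islice simply stops when the iterators are exhausted, the cap degrades gracefully when the selected subreddits yield fewer than 100,000 comments, rather than raising as select(range(...)) would.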