Spaces:

mangoesai
/

Refresh_Praw_pinecone_dataset

Sleeping

App Files Files Community

Vera-ZWY committed on Nov 19, 2024

Commit

513d672

verified ·

1 Parent(s): f957c0d

Create praw_newgest_df2024.py

Browse files

Files changed (1) hide show

praw_newgest_df2024.py +246 -0

praw_newgest_df2024.py ADDED Viewed

	@@ -0,0 +1,246 @@

+# import gradio as gr
+import numpy as np
+import pandas as pd
+import praw
+from huggingface_hub import HfApi, HfFolder
+import time
+import os
+from datetime import datetime
+# from tqdm import tqdm
+HfFolder.save_token(os.getenv("HF_TOKEN"))
+try:
+# def initialize_reddit():
+    reddit = praw.Reddit(client_id= os.getenv("PRAW_CLIENT_ID"),
+                         client_secret= os.getenv("PRAW_CLIENT_SECRET"),
+                         user_agent= os.getenv("RPAW_AGENT"),
+                         check_for_async=False
+                          )
+except praw.exceptions.PRAWException as e:
+    print(f"PRAW Exception: {str(e)}")
+    # return None
+except Exception as e:
+    print(f"An error occurred: {str(e)}")
+    # return None
+def scrape_reddit(subreddit_name = None, keywords = None, limit = 1000):
+    posts_data = []
+    if subreddit_name:
+        subreddit = reddit.subreddit(subreddit_name)
+        if keywords:
+            posts = subreddit.search(keywords, limit=limit)
+        else:
+            posts = subreddit.hot(limit=limit)
+    else:
+        posts = reddit.subreddit("all").search(keywords, limit=limit)
+    # print(posts)
+    for post in posts:
+        # print(post.title)
+        try:
+            post_data = {
+                "title": post.title,
+                "score": post.score,
+                "id": post.id,
+                "url": post.url,
+                "num_comments": post.num_comments,
+                "created": datetime.fromtimestamp(post.created),
+                "body": post.selftext,
+                "subreddit": post.subreddit.display_name
+            }
+            posts_data.append(post_data)
+            # Add a small delay to avoid hitting rate limits
+            time.sleep(0.1)
+        except praw.exceptions.PRAWException as e:
+            print(f"Error processing post {post.id}: {str(e)}")
+            continue
+    df = pd.DataFrame(posts_data)
+    df['content'] = df['title'] + '\n' + df['body']
+    return df
+def get_comments(reddit, post_id, limit=100):
+    """
+    Get top comments from a specific post.
+    Args:
+        reddit: Reddit instance
+        post_id (str): ID of the post to get comments from
+        limit (int): Maximum number of comments to retrieve (default 100)
+    Returns:
+        pd.DataFrame: DataFrame containing top comments data
+    """
+    try:
+        submission = reddit.submission(id=post_id)
+        comments_data = []
+        # Replace MoreComments objects with actual comments, limited to save time
+        submission.comments.replace_more(limit=0)  # Ignore "More Comments" expansions
+        # Get all top-level comments
+        all_comments = submission.comments.list()
+        # Sort comments by score and take top ones
+        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]
+        for comment in sorted_comments:
+            try:
+                comment_data = {
+                    'comment_id': comment.id,
+                    'post_id': post_id,
+                    'post_title': submission.title,
+                    # 'author': str(comment.author) if comment.author else '[deleted]',
+                    'body': comment.body,
+                    'score': comment.score,
+                    'created_utc': datetime.fromtimestamp(comment.created_utc)
+                    # 'parent_id': comment.parent_id,
+                    # 'is_submitter': comment.is_submitter
+                }
+                comments_data.append(comment_data)
+            except Exception as e:
+                print(f"Error processing comment {comment.id}: {str(e)}")
+                continue
+        print(comments_data)
+        # Create DataFrame
+        df = pd.DataFrame(comments_data)
+        # Sort by score (highest first)
+        if not df.empty:
+            print("sort comments by score")
+            df = df.sort_values('score', ascending=False)
+        return df
+    except praw.exceptions.PRAWException as e:
+        print(f"PRAW Exception while getting comments: {str(e)}")
+        return pd.DataFrame()
+    except Exception as e:
+        print(f"Error getting comments: {str(e)}")
+        return pd.DataFrame()
+def get_comments_and_upload(df, dataset_repo_id):
+    # Initialize the Hugging Face API
+    api = HfApi()
+    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")
+    # Iterate over each submission in the DataFrame
+    for index, row in df.iterrows():
+        csv_file_path = f"comments_{row['id']}.csv"
+        repo_csv_path = f"comments/{csv_file_path}"
+        # Check if this file already exists in the Hugging Face dataset
+        # if repo_csv_path in existing_files:
+        #     print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
+        #     continue
+        # Fetch comments for the current submission
+        comments_df = get_comments(reddit, row['id'])
+        # # Prepare data for the current submission’s comments
+        # comments_data = [{
+        #     'comment_id': comment.id,
+        #     'comment_content': comment.body,
+        #     'comment_created': comment.created,
+        #     'submission_id': row['id']
+        # } for comment in comments]
+        # Create a DataFrame for the current submission's comments
+        # comments_df = pd.DataFrame(comments_data, columns=['comment_id', 'comment_content', 'comment_created', 'submission_id'])
+        if len(comments_df) == 0:
+            print(f"No comments found for {row['id']}")
+            # continue
+        # Define a unique CSV filename for each submission based on its ID
+        csv_file_path = f"comments_{row['id']}.csv"
+        # Save the comments DataFrame as a CSV file
+        comments_df.to_csv(csv_file_path, index=False)
+        # Upload the CSV file to the Hugging Face dataset repository
+        api.upload_file(
+            path_or_fileobj=csv_file_path,
+            path_in_repo=f"comments/{csv_file_path}",  # Save in a 'comments' folder in the dataset repo
+            repo_id=dataset_repo_id,
+            repo_type="dataset"
+        )
+        print(f"Uploaded {csv_file_path} to Hugging Face.")
+        # Optionally, delete the local CSV file to save space
+        os.remove(csv_file_path)
+    print("All comments CSV files uploaded successfully!")
+def main():
+    # Example usage
+    try:
+        # Search for 2016 election posts
+        df = scrape_reddit(keywords="election")
+        if df is not None and not df.empty:
+            print(f"Successfully scraped {len(df)} posts")
+            # Save to CSV
+            # df.to_csv("reddit_2016_election_posts.csv", index=False)
+            df['created'] = pd.to_datetime(df['created'], unit='s')
+            df = df.sort_values(by='created', ascending=True)
+            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)
+            # df_16 = df_16[df_16['created'] > '2015-12-31'].reset_index(drop=True)
+            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"
+            # reate database if it's not exsit
+            api = HfApi()
+            try:
+                api.dataset_info(dataset_repo_id)
+                # dataset_exists = True
+                print(f"Dataset {dataset_repo_id} already exists.")
+            except Exception:
+            # dataset_exists = False
+                print(f"Dataset {dataset_repo_id} will be created.")
+                # If the dataset doesn't exist, create it and then upload the CSV file
+                # api.create_repo(repo_id=dataset_repo_id, repo_type="dataset")
+            df_24.to_csv("df_24.csv", index=False)
+            csv_file_path = "df_24.csv"
+            api.upload_file(
+                path_or_fileobj= csv_file_path,
+                path_in_repo="df_24_newest.csv",
+                repo_id=dataset_repo_id,
+                repo_type="dataset"
+            )
+            get_comments_and_upload(df_24, dataset_repo_id)
+        else:
+            print("No data was retrieved")
+    except Exception as e:
+        print(f"Error in main: {str(e)}")
+if __name__ == '__main__':
+    main()