Spaces:
Runtime error
Create app.py
app.py
ADDED
@@ -0,0 +1,118 @@
import gradio as gr
import requests
import pytesseract
from PIL import Image
import docx
from transformers import pipeline
from keybert import KeyBERT
from io import BytesIO

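# Note on dependencies (an assumption based on the imports above, not part of the
# original commit): on a Hugging Face Space this app needs a requirements.txt listing
# roughly gradio, requests, pytesseract, Pillow, python-docx, transformers, torch and
# keybert, plus a packages.txt containing tesseract-ocr so the Tesseract binary exists
# in the Linux container. A missing system binary or Python package is a common cause
# of the "Runtime error" status this Space shows.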
# Set up Tesseract (ensure the Tesseract binary is installed on your system)
# For Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Set up AI models:
# a GoEmotions emotion classifier (DistilBERT student model), a KeyBERT keyword
# extractor, and a BART-large-MNLI zero-shot classifier for frames/categories.
emotion_classifier = pipeline("text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student")
keyword_extractor = KeyBERT()
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

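# All three models are loaded once at startup; the first launch downloads them from the
# Hugging Face Hub (bart-large-mnli alone is on the order of 1.6 GB), so expect a slow
# cold start and make sure the Space has enough disk and memory for the downloads.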
# Function to extract text from an image using Tesseract OCR
def extract_text_from_image(image_url):
    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        # A placeholder URL or a non-image response should not crash the whole run
        return f"[OCR failed: {e}]"

# Function to extract posts from social media (placeholder for actual scraping logic)
def extract_posts(profile_url, hashtags, num_posts):
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    return posts[: int(num_posts)]  # int() guards against a float arriving from the UI

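# A minimal way to feed real data into the pipeline without writing a scraper (a sketch,
# not part of the original app): load posts from a JSON export that follows the same
# dict schema as the placeholder above. The file name in the usage comment is hypothetical.
def load_posts_from_json(json_path, hashtags, num_posts):
    import json  # local import to keep the sketch self-contained

    # Each entry is expected to provide caption / image_url / date / likes / comments
    with open(json_path, "r", encoding="utf-8") as f:
        posts = json.load(f)
    # Keep only posts whose caption mentions at least one of the requested hashtags
    if hashtags:
        posts = [p for p in posts if any(tag.lower() in p["caption"].lower() for tag in hashtags)]
    return posts[: int(num_posts)]

# Usage (hypothetical file): posts = load_posts_from_json("posts.json", ["#climate"], 5)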
# Function to categorize the post (frame extraction) using zero-shot classification
def categorize_post(caption):
    categories = ["activism", "politics", "social issues", "technology", "environment", "health"]
    result = zero_shot_classifier(caption, candidate_labels=categories)
    return result["labels"][0]  # Return the most likely category

# Function to analyze detailed sentiment using the GoEmotions emotion classifier
def analyze_sentiment(caption):
    emotions = emotion_classifier(caption, top_k=None)  # scores for every emotion label
    top_emotions = sorted(emotions, key=lambda x: x["score"], reverse=True)[:3]  # Top 3 emotions
    return top_emotions

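# Illustrative return shapes (the values below are made up; only the structure matters):
#   categorize_post("Join the climate march!")   -> "activism"
#   analyze_sentiment("Join the climate march!") ->
#       [{"label": "optimism", "score": 0.41},
#        {"label": "excitement", "score": 0.22},
#        {"label": "caring", "score": 0.12}]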
# Function to process posts and generate a Word document
def process_posts(profile_url, hashtags, num_posts):
    hashtags = [tag.strip() for tag in hashtags.split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for i, post in enumerate(posts):
        doc.add_heading(f"Post {i+1}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {post['caption']}")

        # Extract text from the image using Tesseract
        if post["image_url"]:
            extracted_text = extract_text_from_image(post["image_url"])
            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the GoEmotions emotion classifier
        emotions = analyze_sentiment(post["caption"])
        emotion_text = ", ".join([f"{e['label']} ({e['score']:.2f})" for e in emotions])
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword extraction
        keywords = keyword_extractor.extract_keywords(post["caption"], keyphrase_ngram_range=(1, 2), stop_words="english")
        doc.add_paragraph(f"Extracted Keywords: {', '.join([kw[0] for kw in keywords])}")

        # Frame extraction (context/category)
        category = categorize_post(post["caption"])
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path

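# Note: "extracted_posts.docx" is a fixed path, so concurrent users of the Space would
# overwrite each other's output. A sketch of one way around that (not in the original),
# using only the standard library:
#
#   import tempfile
#   def _unique_doc_path():
#       tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
#       tmp.close()
#       return tmp.name  # pass this to doc.save(...) instead of the fixed name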
# Gradio handler
def gradio_app(profile_url, hashtags, num_posts):
    try:
        doc_path = process_posts(profile_url, hashtags, num_posts)
        return doc_path
    except Exception as e:
        # Returning a plain string to a gr.File output would itself fail,
        # so surface the problem in the UI instead
        raise gr.Error(f"Error: {e}")

# Gradio UI
iface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(label="Social Media Profile URL"),
        gr.Textbox(label="Hashtags (comma-separated)"),
        gr.Number(label="Number of Posts to Extract", precision=0),
    ],
    outputs=gr.File(label="Download Extracted Posts"),
    title="Social Media Post Extractor",
    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed emotion analysis (GoEmotions), keyword extraction (KeyBERT), and frame (context/category) extraction (zero-shot classification).",
)

iface.launch()