# Hugging Face Space: ahm14/Post_Extractor
# Space status: Runtime error
# File size: 4,656 Bytes

import gradio as gr
import requests
import pytesseract
from PIL import Image
import os
import docx
from transformers import pipeline
from keybert import KeyBERT
from io import BytesIO

# Set up Tesseract (ensure Tesseract is installed on your system)
# For Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Set up AI models
# Emotion scoring model, used by analyze_sentiment().
emotion_classifier = pipeline(
    task="text-classification",
    model="joeddav/distilbert-base-uncased-go-emotions-student",
)
# Keyword/keyphrase extractor with KeyBERT's default settings.
keyword_extractor = KeyBERT()
# Zero-shot model, used by categorize_post() to pick a topic label.
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Function to extract text from image using Tesseract
def extract_text_from_image(image_url, timeout=30):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: URL of the image to run OCR on.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the caller indefinitely.

    Returns:
        The recognised text with leading and trailing whitespace removed.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here so an HTML error page is never handed to PIL,
    # which would otherwise surface as a confusing image-decoding error.
    response.raise_for_status()
    # The context manager closes the image once OCR has finished with it.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()

# Function to extract posts from social media (placeholder for actual scraping logic)
def extract_posts(profile_url, hashtags, num_posts):
    """Return up to ``num_posts`` sample posts.

    This is a placeholder for real scraping logic: ``profile_url`` and
    ``hashtags`` are accepted but not used yet, and the posts are hard-coded.

    Args:
        profile_url: Profile to extract from (currently unused).
        hashtags: Hashtags to filter by (currently unused).
        num_posts: Maximum number of posts to return. ``None`` returns every
            available post, numeric values such as ``2.0`` are truncated to
            an int, and negative values are treated as zero.

    Returns:
        A list of dicts with the keys ``caption``, ``image_url``, ``date``,
        ``likes`` and ``comments``.
    """
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # Slicing needs an int, and a negative bound would silently drop posts
    # from the end instead of returning nothing.
    limit = max(int(num_posts), 0)
    return posts[:limit]

# Function to categorize post using Zero-Shot Classification
def categorize_post(caption):
    """Pick a topic label for ``caption`` via zero-shot classification.

    The caption is scored against a fixed list of candidate labels by the
    module-level ``zero_shot_classifier``; the first entry of the returned
    ``labels`` list (the most likely category) is handed back.
    """
    candidate_labels = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    prediction = zero_shot_classifier(caption, candidate_labels=candidate_labels)
    ranked_labels = prediction["labels"]
    return ranked_labels[0]

# Function to analyze detailed sentiment using the DistilBERT-based GoEmotions emotion classifier
def analyze_sentiment(caption):
    """Return the three highest-scoring emotions for ``caption``.

    The module-level ``emotion_classifier`` is called with ``top_k=None``;
    its result entries are ordered by their ``"score"`` value, highest
    first, and the first three are returned.
    """

    def _score(entry):
        return entry["score"]

    ranked = sorted(emotion_classifier(caption, top_k=None), key=_score, reverse=True)
    return ranked[:3]

# Function to process posts and generate Word document
def process_posts(profile_url, hashtags, num_posts):
    """Analyse extracted posts and write the results to a Word document.

    Each post gets its own section containing its metadata, the text
    recognised in its image (when it has one), its top emotions, extracted
    keywords and a topic category.

    Args:
        profile_url: Forwarded to ``extract_posts``.
        hashtags: Comma-separated hashtag string. Blank entries are ignored
            and ``None`` is treated as an empty string.
        num_posts: Forwarded to ``extract_posts``.

    Returns:
        Path of the saved document (``"extracted_posts.docx"``, relative to
        the current working directory).
    """
    # Drop blank entries so "" or "a,,b" never yields an empty hashtag.
    hashtags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for number, post in enumerate(posts, start=1):
        caption = post["caption"]

        doc.add_heading(f"Post {number}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {caption}")

        # Extract text from image using Tesseract
        if post["image_url"]:
            # OCR is best-effort: a broken link, an undecodable image or a
            # Tesseract failure should not discard the rest of the report,
            # so the failure is recorded in the document instead.
            try:
                extracted_text = extract_text_from_image(post["image_url"])
            except (requests.RequestException, OSError, RuntimeError) as exc:
                extracted_text = f"[unavailable: {exc}]"
            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the DistilBERT-based emotion classifier
        emotions = analyze_sentiment(caption)
        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword Extraction
        keywords = keyword_extractor.extract_keywords(
            caption, keyphrase_ngram_range=(1, 2), stop_words="english"
        )
        # Each keyword entry is indexed at 0 to get the phrase itself.
        doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

        # Frame Extraction (Context/Category)
        category = categorize_post(caption)
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path

# Gradio Interface
def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: build the report and return its path for download.

    Args:
        profile_url: Value of the "Social Media Profile URL" textbox.
        hashtags: Value of the "Hashtags (comma-separated)" textbox.
        num_posts: Value of the "Number of Posts to Extract" field.

    Returns:
        Path of the generated ``.docx`` file, consumed by the ``gr.File``
        output component.

    Raises:
        gr.Error: If processing fails, carrying the original error message
            so Gradio can show it to the user.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        # The output is a gr.File, so returning an "Error: ..." string would
        # be interpreted as a file path and hide the real failure. Raising
        # gr.Error surfaces the message in the UI instead.
        raise gr.Error(f"Error: {str(e)}") from e

# Gradio UI
# Input widgets, in the positional order gradio_app expects its arguments.
_input_components = [
    gr.Textbox(label="Social Media Profile URL"),
    gr.Textbox(label="Hashtags (comma-separated)"),
    gr.Number(label="Number of Posts to Extract", precision=0),
]
# The callback's return value (a file path) is offered as a download.
_output_component = gr.File(label="Download Extracted Posts")

iface = gr.Interface(
    fn=gradio_app,
    inputs=_input_components,
    outputs=_output_component,
    title="Social Media Post Extractor",
    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed sentiment analysis (RoBERTa), keyword extraction, and frame (context/category) extraction.",
)

# Start the web app as soon as this module is executed.
iface.launch()