# Spaces: Runtime error
# (Page-status text captured together with the source; it is not part of the program.)
import gradio as gr | |
import requests | |
import pytesseract | |
from PIL import Image | |
import os | |
import docx | |
from transformers import pipeline | |
from keybert import KeyBERT | |
from io import BytesIO | |
# Set up Tesseract (ensure Tesseract is installed on your system) | |
# For Windows, specify the Tesseract path if needed: | |
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' | |
# Set up AI models | |
# Emotion classifier: a DistilBERT student model trained on GoEmotions labels.
emotion_classifier = pipeline(
    task="text-classification",
    model="joeddav/distilbert-base-uncased-go-emotions-student",
)

# Keyword/keyphrase extractor, constructed with KeyBERT's default settings.
keyword_extractor = KeyBERT()

# Zero-shot classifier used to assign each caption to one of a fixed label set.
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
# Function to extract text from image using Tesseract | |
def extract_text_from_image(image_url, timeout=30):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The OCR output with leading and trailing whitespace removed.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
    """
    # Without a timeout a stalled server would block this call indefinitely.
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the image decoder.
    response.raise_for_status()
    # Release the decoded image as soon as OCR has finished with it.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()
# Function to extract posts from social media (placeholder for actual scraping logic) | |
def extract_posts(profile_url, hashtags, num_posts):
    """Return up to ``num_posts`` sample posts (placeholder for real scraping).

    Args:
        profile_url: Profile to read posts from. Not used by the placeholder.
        hashtags: Hashtags to filter by. Not used by the placeholder.
        num_posts: Maximum number of posts to return. ``None`` returns every
            sample post; fractional values are truncated to an integer and
            negative values are treated as zero.

    Returns:
        A list of post dicts with the keys ``caption``, ``image_url``,
        ``date``, ``likes`` and ``comments``.
    """
    # Placeholder data standing in for actual scraping logic.
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # A float count would make the slice raise TypeError, and a negative count
    # would slice from the end of the list; coerce and clamp before slicing.
    limit = max(int(num_posts), 0)
    return posts[:limit]
# Function to categorize post using Zero-Shot Classification | |
def categorize_post(caption):
    """Return the top-ranked category for a caption.

    The caption is scored against a fixed set of candidate labels by the
    module-level zero-shot classifier, and the first entry of the result's
    ``labels`` list is returned.
    """
    candidate_labels = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    prediction = zero_shot_classifier(caption, candidate_labels=candidate_labels)
    ranked_labels = prediction["labels"]
    # The first label is the most likely category.
    return ranked_labels[0]
# Function to analyze detailed sentiment using the DistilBERT-based GoEmotions classifier
def analyze_sentiment(caption):
    """Return the three highest-scoring emotion predictions for a caption.

    The module-level emotion classifier is asked for the scores of every
    label (``top_k=None``); the predictions are then ordered by descending
    ``score`` and the first three are returned.
    """

    def by_score(prediction):
        return prediction["score"]

    all_predictions = emotion_classifier(caption, top_k=None)
    ranked = sorted(all_predictions, key=by_score, reverse=True)
    return ranked[:3]
# Function to process posts and generate Word document | |
def process_posts(profile_url, hashtags, num_posts):
    """Analyse the extracted posts and write the results to a Word document.

    For each post the document records its date, likes, comments and caption,
    the OCR text of its image (when it has an image URL), its top emotions,
    extracted keywords, and its category.

    Args:
        profile_url: Profile URL, forwarded to ``extract_posts``.
        hashtags: Comma-separated hashtags as a single string.
        num_posts: Maximum number of posts, forwarded to ``extract_posts``.

    Returns:
        The path of the saved document, ``"extracted_posts.docx"``.
    """
    # Skip blank entries so inputs such as "" or "a,,b" yield no empty hashtags.
    hashtags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for number, post in enumerate(posts, start=1):
        caption = post["caption"]

        doc.add_heading(f"Post {number}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {caption}")

        # Extract text from image using Tesseract
        if post["image_url"]:
            try:
                extracted_text = extract_text_from_image(post["image_url"])
            except (requests.RequestException, OSError, RuntimeError) as exc:
                # OCR is supplementary: one unreachable or undecodable image
                # should not discard the analysis of every other post.
                extracted_text = f"[unavailable: {exc}]"
            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the emotion classifier
        emotions = analyze_sentiment(caption)
        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword Extraction
        keywords = keyword_extractor.extract_keywords(
            caption,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
        )
        # Each keyword entry is indexed at 0 for its text.
        keyword_text = ", ".join(kw[0] for kw in keywords)
        doc.add_paragraph(f"Extracted Keywords: {keyword_text}")

        # Frame Extraction (Context/Category)
        category = categorize_post(caption)
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path
# Gradio Interface | |
def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: build the report and return its file path.

    Args:
        profile_url: Value of the profile URL textbox.
        hashtags: Value of the comma-separated hashtags textbox.
        num_posts: Value of the post-count number field.

    Returns:
        The path of the generated Word document.

    Raises:
        gr.Error: Processing failed; carries the underlying error message.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        # The output component is a gr.File, so a returned error string would
        # be treated as a file path. Raising gr.Error shows the message in
        # the UI instead.
        raise gr.Error(f"Error: {e}") from e
# Gradio UI | |
# Input widgets, in the order gradio_app receives them as arguments.
input_components = [
    gr.Textbox(label="Social Media Profile URL"),
    gr.Textbox(label="Hashtags (comma-separated)"),
    gr.Number(label="Number of Posts to Extract", precision=0),
]

# The generated Word document is offered as a file download.
output_component = gr.File(label="Download Extracted Posts")

iface = gr.Interface(
    fn=gradio_app,
    inputs=input_components,
    outputs=output_component,
    title="Social Media Post Extractor",
    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed sentiment analysis (RoBERTa), keyword extraction, and frame (context/category) extraction.",
)

# Start the web app.
iface.launch()