"""Gradio app that turns social media posts into a Word report.

For each post the report lists its metadata and caption, any text found in
the post image via Tesseract OCR, the top emotion scores, KeyBERT keywords,
and a zero-shot topic category. Post extraction is currently a placeholder
that returns hard-coded sample posts.
"""
import gradio as gr
import requests
import pytesseract
from PIL import Image
import os
import docx
from transformers import pipeline
from keybert import KeyBERT
from io import BytesIO
# --- OCR backend -----------------------------------------------------------
# Tesseract itself must be installed on the system for pytesseract to work.
# On Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# --- AI models (created once, at module load) ------------------------------
# Emotion scores for captions.
emotion_classifier = pipeline(
    task="text-classification",
    model="joeddav/distilbert-base-uncased-go-emotions-student",
)
# Keyword / keyphrase extraction.
keyword_extractor = KeyBERT()
# Topic categorisation against a list of candidate labels.
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
def extract_text_from_image(image_url, timeout=10):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the caller forever.

    Returns:
        The OCR output with surrounding whitespace stripped.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        OSError: If the downloaded payload cannot be opened as an image.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the decoder.
    response.raise_for_status()
    # The context manager releases the image's resources once OCR is done.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()
def extract_posts(profile_url, hashtags, num_posts):
    """Return up to ``num_posts`` posts for a profile (placeholder data).

    No scraping is implemented yet: ``profile_url`` and ``hashtags`` are
    accepted but not used, and the result is taken from two hard-coded
    sample posts.

    Args:
        profile_url: Profile to extract from (currently unused).
        hashtags: Hashtags to filter by (currently unused).
        num_posts: Maximum number of posts to return. ``None`` means no
            limit, fractional values are truncated to an integer, and
            negative values are treated as zero.

    Returns:
        A list of dicts with the keys ``caption``, ``image_url``, ``date``,
        ``likes`` and ``comments``.
    """
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # A raw negative slice bound would count from the end of the list and a
    # float bound would raise TypeError, so normalise to a non-negative int.
    limit = max(0, int(num_posts))
    return posts[:limit]
def categorize_post(caption):
    """Choose the best-matching topic for a caption via zero-shot classification.

    Args:
        caption: Post text to classify.

    Returns:
        The first entry of the classifier's ``"labels"`` list, i.e. the most
        likely of the candidate categories.
    """
    candidate_topics = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    prediction = zero_shot_classifier(caption, candidate_labels=candidate_topics)
    ranked_labels = prediction["labels"]
    return ranked_labels[0]
def analyze_sentiment(caption):
    """Score a caption with the emotion classifier and keep the strongest three.

    Args:
        caption: Post text to analyse.

    Returns:
        At most three classifier results, ordered by descending ``"score"``.
    """
    def by_score(entry):
        return entry["score"]

    # top_k=None asks the classifier for a score on every emotion label.
    all_scores = emotion_classifier(caption, top_k=None)
    ranked = list(all_scores)
    ranked.sort(key=by_score, reverse=True)
    return ranked[:3]
def process_posts(profile_url, hashtags, num_posts):
    """Analyse the extracted posts and write the results to a Word document.

    Each post gets its own section containing date, likes, comments and
    caption, the OCR text of its image (when it has one), the top emotions,
    extracted keywords, and a topic category.

    Args:
        profile_url: Profile URL forwarded to ``extract_posts``.
        hashtags: Comma-separated hashtag string; blank entries are dropped
            and ``None`` is treated as no hashtags.
        num_posts: Maximum number of posts to include.

    Returns:
        Path of the saved document (``"extracted_posts.docx"``).
    """
    # Filter blanks so "" or "a,,b" do not produce empty hashtag entries.
    tags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, tags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for index, post in enumerate(posts, start=1):
        caption = post["caption"]

        doc.add_heading(f"Post {index}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {caption}")

        # Extract text from image using Tesseract
        image_url = post.get("image_url")
        if image_url:
            try:
                extracted_text = extract_text_from_image(image_url)
            except (requests.RequestException, OSError, pytesseract.TesseractError) as exc:
                # One unreachable or undecodable image should not discard the
                # whole report; record the failure where the text would go.
                extracted_text = f"[OCR failed: {exc}]"
            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed emotion analysis of the caption
        emotions = analyze_sentiment(caption)
        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword Extraction (single words and two-word phrases)
        keywords = keyword_extractor.extract_keywords(
            caption,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
        )
        # Each result is indexed at 0 for the keyword text itself.
        doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

        # Frame Extraction (Context/Category)
        category = categorize_post(caption)
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path
def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: build the report and return its path for download.

    Args:
        profile_url: Value of the profile URL textbox.
        hashtags: Value of the comma-separated hashtags textbox.
        num_posts: Value of the post-count number input.

    Returns:
        Path of the generated Word document.

    Raises:
        gr.Error: If report generation fails for any reason; the message is
            the original error text prefixed with ``"Error: "``.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        # The output component is a gr.File, so a returned error string would
        # be treated as a file path. Raising gr.Error shows the message in the
        # UI instead.
        raise gr.Error(f"Error: {e}") from e
# Gradio UI: three inputs feed gradio_app; the result is offered as a file download.
iface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(label="Social Media Profile URL"),
        gr.Textbox(label="Hashtags (comma-separated)"),
        # precision=0 rounds the entered value to a whole number of posts.
        gr.Number(label="Number of Posts to Extract", precision=0),
    ],
    outputs=gr.File(label="Download Extracted Posts"),
    title="Social Media Post Extractor",
    # The emotion model loaded by this app is a DistilBERT GoEmotions
    # checkpoint, so the description names that rather than RoBERTa.
    description=(
        "Extract posts from a social media profile based on hashtags. "
        "Includes OCR (Tesseract), detailed sentiment analysis (DistilBERT GoEmotions), "
        "keyword extraction, and frame (context/category) extraction."
    ),
)
iface.launch()