Post_Extractor / app.py
ahm14's picture
Update app.py
f83098d verified
raw
history blame
4.66 kB
import gradio as gr
import requests
import pytesseract
from PIL import Image
import os
import docx
from transformers import pipeline
from keybert import KeyBERT
from io import BytesIO
# Set up Tesseract (ensure Tesseract is installed on your system)
# For Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Set up AI models. These module-level statements run once when the file is
# loaded, and the resulting objects are shared by the functions in this file.

# Text-classification pipeline used for emotion scoring.
emotion_classifier = pipeline(
    task="text-classification",
    model="joeddav/distilbert-base-uncased-go-emotions-student",
)

# KeyBERT instance used for keyword / keyphrase extraction.
keyword_extractor = KeyBERT()

# Zero-shot classification pipeline used to assign a category to a caption.
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
# Function to extract text from image using Tesseract
def extract_text_from_image(image_url, timeout=30):
    """Download an image and return the text Tesseract recognizes in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The OCR output with leading and trailing whitespace removed.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive host.
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the image decoder.
    response.raise_for_status()
    # Close the image as soon as OCR is done instead of leaving it to the GC.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()
# Function to extract posts from social media (placeholder for actual scraping logic)
def extract_posts(profile_url, hashtags, num_posts):
    """Return up to ``num_posts`` sample posts.

    This is a placeholder: ``profile_url`` and ``hashtags`` are accepted but
    not used yet, and the posts are hard-coded samples.

    Args:
        profile_url: Profile to extract from (currently unused).
        hashtags: Hashtags to filter by (currently unused).
        num_posts: Maximum number of posts to return. ``None`` means no
            limit; negative values are treated as zero; floats are truncated
            to an integer.

    Returns:
        A list of post dicts with the keys ``caption``, ``image_url``,
        ``date``, ``likes`` and ``comments``.
    """
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # A raw slice would treat -1 as "all but the last" and reject floats;
    # normalize to a non-negative integer count first.
    limit = max(int(num_posts), 0)
    return posts[:limit]
# Function to categorize post using Zero-Shot Classification
def categorize_post(caption):
    """Classify ``caption`` against a fixed set of categories.

    Args:
        caption: The post text to classify.

    Returns:
        The first entry of the ``labels`` list in the zero-shot classifier's
        result, i.e. its top-ranked category.
    """
    candidate_labels = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    prediction = zero_shot_classifier(caption, candidate_labels=candidate_labels)
    ranked_labels = prediction["labels"]
    return ranked_labels[0]
# Function to analyze detailed sentiment using the emotion classifier
# (a DistilBERT-based GoEmotions model, see the model setup in this file)
def analyze_sentiment(caption):
    """Return the three highest-scoring emotions for ``caption``.

    Args:
        caption: The post text to analyze.

    Returns:
        A list of at most three classifier results, ordered by descending
        ``score``.
    """
    # top_k=None asks the pipeline for a score on every label.
    scored = list(emotion_classifier(caption, top_k=None))
    scored.sort(key=lambda item: item["score"], reverse=True)
    return scored[:3]
# Function to process posts and generate Word document
def process_posts(profile_url, hashtags, num_posts):
    """Extract posts, analyze each one, and write the results to a .docx file.

    For every post the document records the date, like and comment counts,
    the caption, any text recognized in the post's image, the top emotions,
    extracted keywords, and the predicted category.

    Args:
        profile_url: Social media profile URL, passed to ``extract_posts``.
        hashtags: Comma-separated hashtag string; blank entries are ignored
            and ``None`` is treated as no hashtags.
        num_posts: Number of posts to extract, passed to ``extract_posts``.

    Returns:
        The path of the saved Word document (``"extracted_posts.docx"``).
    """
    # Drop blank entries so "" or "a,,b" does not produce empty hashtags.
    hashtags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for index, post in enumerate(posts, start=1):
        caption = post["caption"]

        doc.add_heading(f"Post {index}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {caption}")

        # Extract text from image using Tesseract
        image_url = post.get("image_url")
        if image_url:
            try:
                extracted_text = extract_text_from_image(image_url)
            except (requests.RequestException, OSError, RuntimeError) as exc:
                # One unreachable or undecodable image should not abort the
                # whole report; record the failure and keep going.
                extracted_text = f"[unavailable: {exc}]"
            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the emotion classifier
        emotions = analyze_sentiment(caption)
        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword Extraction (single words and two-word phrases)
        keywords = keyword_extractor.extract_keywords(
            caption,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
        )
        # Each result's first element is the keyword/phrase itself.
        doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

        # Frame Extraction (Context/Category)
        category = categorize_post(caption)
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path
# Gradio Interface
def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: build the report and return its file path.

    Args:
        profile_url: Value of the profile URL textbox.
        hashtags: Value of the comma-separated hashtags textbox.
        num_posts: Value of the number-of-posts input.

    Returns:
        Path of the generated Word document, for the ``gr.File`` output.

    Raises:
        gr.Error: If processing fails. The output component is a file, so
            returning an error string would be treated as a file path;
            raising ``gr.Error`` surfaces the message in the UI instead.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        raise gr.Error(f"Failed to extract posts: {e}") from e
# Gradio UI: three inputs (profile URL, hashtags, post count) feeding
# gradio_app, whose result is offered as a downloadable file.
_inputs = [
    gr.Textbox(label="Social Media Profile URL"),
    gr.Textbox(label="Hashtags (comma-separated)"),
    # precision=0 makes the number input integer-valued.
    gr.Number(label="Number of Posts to Extract", precision=0),
]
_output = gr.File(label="Download Extracted Posts")
_description = (
    "Extract posts from a social media profile based on hashtags. "
    "Includes OCR (Tesseract), detailed sentiment analysis (RoBERTa), "
    "keyword extraction, and frame (context/category) extraction."
)

iface = gr.Interface(
    fn=gradio_app,
    inputs=_inputs,
    outputs=_output,
    title="Social Media Post Extractor",
    description=_description,
)

# Start the web app.
iface.launch()