"""Gradio app that turns social media posts into an annotated Word report.

For every post the report lists basic metadata, text recognised in the
post's image (Tesseract OCR), the top emotions of the caption, extracted
keywords and a zero-shot category ("frame").
"""

import os
import tempfile
from io import BytesIO

import docx
import gradio as gr
import pytesseract
import requests
from keybert import KeyBERT
from PIL import Image
from transformers import pipeline

# Set up Tesseract (ensure Tesseract is installed on your system).
# For Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Seconds to wait for an image download before giving up, so a stalled
# server cannot hang a request forever.
REQUEST_TIMEOUT_SECONDS = 10

# Set up AI models (loaded once at import time and shared by all requests).
emotion_classifier = pipeline(
    "text-classification",
    model="joeddav/distilbert-base-uncased-go-emotions-student",
)
keyword_extractor = KeyBERT()
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)


def extract_text_from_image(image_url):
    """Download the image at *image_url* and return its OCR text, stripped.

    Raises:
        requests.RequestException: the download failed, timed out or the
            server answered with an HTTP error status.
        OSError: the downloaded bytes could not be opened as an image.
    """
    response = requests.get(image_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail early on 4xx/5xx instead of handing an error page to PIL.
    response.raise_for_status()
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()


def extract_posts(profile_url, hashtags, num_posts):
    """Return up to *num_posts* posts for the given profile and hashtags.

    Placeholder for actual scraping logic: *profile_url* and *hashtags* are
    currently ignored and hard-coded sample posts are returned. Each post is
    a dict with the keys ``caption``, ``image_url``, ``date``, ``likes`` and
    ``comments``.

    A *num_posts* of ``None`` returns every available post; negative values
    are treated as zero.
    """
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # Clamp at zero: a negative slice bound would silently drop posts from
    # the end instead of limiting the count. int() also accepts floats.
    limit = max(int(num_posts), 0)
    return posts[:limit]


def categorize_post(caption):
    """Return the most likely category of *caption* via zero-shot classification."""
    categories = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    result = zero_shot_classifier(caption, candidate_labels=categories)
    # The first entry of "labels" is taken as the best-scoring category.
    return result["labels"][0]


def analyze_sentiment(caption):
    """Return the three highest-scoring emotions for *caption*.

    Each item is a dict with at least the keys ``label`` and ``score``,
    ordered from highest to lowest score.
    """
    emotions = emotion_classifier(caption, top_k=None)
    # NOTE(review): some transformers versions may wrap the scores of a
    # single input in an outer list; unwrap defensively - confirm against
    # the installed version.
    if emotions and isinstance(emotions[0], list):
        emotions = emotions[0]
    ranked = sorted(emotions, key=lambda item: item["score"], reverse=True)
    return ranked[:3]  # Top 3 emotions


def _add_post_section(doc, index, post):
    """Append the report section for one *post* (1-based *index*) to *doc*."""
    caption = post["caption"]

    doc.add_heading(f"Post {index}", level=1)
    doc.add_paragraph(f"Date: {post['date']}")
    doc.add_paragraph(f"Likes: {post['likes']}")
    doc.add_paragraph(f"Comments: {post['comments']}")
    doc.add_paragraph(f"Caption: {caption}")

    # Extract text from image using Tesseract. This is best-effort: one
    # unreachable or unreadable image must not abort the whole report.
    if post["image_url"]:
        try:
            extracted_text = extract_text_from_image(post["image_url"])
        except (requests.RequestException, OSError, pytesseract.TesseractError) as exc:
            extracted_text = f"[unavailable: {exc}]"
        doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

    # Detailed sentiment analysis using the emotion classifier.
    emotions = analyze_sentiment(caption)
    emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
    doc.add_paragraph(f"Top Emotions: {emotion_text}")

    # Keyword extraction (single words and two-word phrases).
    keywords = keyword_extractor.extract_keywords(
        caption,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
    )
    doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

    # Frame extraction (context/category).
    category = categorize_post(caption)
    doc.add_paragraph(f"Category/Frame: {category}")


def process_posts(profile_url, hashtags, num_posts):
    """Build the Word report for a profile and return the path of the file.

    Args:
        profile_url: URL of the social media profile to read posts from.
        hashtags: Comma-separated hashtags; blank entries are ignored.
        num_posts: Maximum number of posts to include.

    Returns:
        Path of the generated ``extracted_posts.docx`` file.
    """
    # Drop blank entries so "" or "a,,b" do not yield empty hashtags.
    tags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, tags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)
    for index, post in enumerate(posts, start=1):
        _add_post_section(doc, index, post)

    # Write into a fresh directory per call so concurrent requests do not
    # overwrite each other's report, while keeping the download file name.
    output_dir = tempfile.mkdtemp(prefix="extracted_posts_")
    doc_path = os.path.join(output_dir, "extracted_posts.docx")
    doc.save(doc_path)
    return doc_path


def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: return the report path or surface a readable error.

    Raises:
        gr.Error: report generation failed. The output component is a file,
            so returning an error string would be treated as a file path;
            raising gr.Error shows the message in the UI instead.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e


# Gradio UI
iface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(label="Social Media Profile URL"),
        gr.Textbox(label="Hashtags (comma-separated)"),
        gr.Number(label="Number of Posts to Extract", precision=0),
    ],
    outputs=gr.File(label="Download Extracted Posts"),
    title="Social Media Post Extractor",
    description=(
        "Extract posts from a social media profile based on hashtags. "
        "Includes OCR (Tesseract), detailed sentiment analysis "
        "(DistilBERT GoEmotions), keyword extraction, and frame "
        "(context/category) extraction."
    ),
)

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()