# AA_Final1 / app.py
import streamlit as st
import re
from langdetect import detect
from transformers import pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from docx import Document
import io
# Download required NLTK resources (quiet=True suppresses repeated log output on reruns)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
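# Note: newer NLTK releases (3.8.2+) look up 'punkt_tab' rather than 'punkt' for
# tokenization; if word_tokenize raises a LookupError on such a release, also run
# nltk.download('punkt_tab'). (Version-dependent behavior, not exercised here.)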
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
# Cache the model so it is loaded once per session rather than on every rerun
@st.cache_resource
def load_pipeline():
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Because load_pipeline() is cached, both names refer to the same underlying
# pipeline object; they are kept separate only for readability at the call sites.
tone_model = load_pipeline()
frame_model = load_pipeline()
# Updated tone categories
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis"],
    "Critical": ["corrupt", "oppression", "failure", "repression", "unjust"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change"],
    "Informative": ["announcement", "event", "scheduled", "update", "details"],
    "Positive": ["progress", "unity", "hope", "victory", "solidarity"],
    "Urgent": ["urgent", "violence", "disappearances", "forced", "killing", "concern", "crisis"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust"],
    "Negative": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief"],
    "Empowering": ["rise", "resist", "mobilize", "inspire", "courage", "change"],
    "Neutral": ["announcement", "event", "scheduled", "update", "details", "protest on"],
    "Hopeful": ["progress", "unity", "hope", "victory", "together", "solidarity"]
}
# Updated frame categories (Limited to 4 selections)
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
}
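# These category names double as the candidate labels handed to the zero-shot
# classifier below; the pipeline scores each label via an NLI hypothesis
# (by default along the lines of "This example is {label}."), so keeping the
# names descriptive helps classification quality.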
# Language detection
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "unknown"
# Keyword matching with lemmatization. Single-word keywords are checked against
# lemmatized tokens; multi-word phrases (e.g. "hate speech") are checked as
# substrings of the raw text, since tokenization would split them apart and
# they could otherwise never match.
def contains_keywords(text, keywords):
    lowered = text.lower()
    lemmatized_words = {lemmatizer.lemmatize(word) for word in word_tokenize(lowered)}
    return any(
        keyword in lowered if " " in keyword else keyword in lemmatized_words
        for keyword in keywords
    )
# Analyze tone: keyword matching first, zero-shot classification as a fallback
def analyze_tone(text):
    detected_tones = set()
    for category, keywords in tone_categories.items():
        if contains_keywords(text, keywords):
            detected_tones.add(category)
    if not detected_tones:
        model_result = tone_model(text, candidate_labels=list(tone_categories.keys()))
        detected_tones.update(model_result["labels"][:2])  # top-2 labels by score
    return list(detected_tones)
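# Illustrative shape of the zero-shot pipeline output (labels come back sorted
# by descending score), assuming the standard transformers format; the caption
# and scores below are made up:
#
#     tone_model("We mourn this loss", candidate_labels=["Somber", "Hopeful"])
#     # -> {"sequence": "We mourn this loss",
#     #     "labels": ["Somber", "Hopeful"], "scores": [0.93, 0.07]}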
# Extract frames based on predefined categories, capped at 4 per caption
def extract_frames(text):
    detected_frames = []
    for category, keywords in frame_categories.items():
        if contains_keywords(text, keywords):
            detected_frames.append(category)
    if not detected_frames:
        model_result = frame_model(text, candidate_labels=list(frame_categories.keys()))
        detected_frames = model_result["labels"][:4]
    # A list (rather than a set) keeps the selection deterministic when more than
    # 4 categories match; slicing a set-derived list would drop frames arbitrarily.
    return detected_frames[:4]
# Extract hashtags
def extract_hashtags(text):
    return re.findall(r"#\w+", text)
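# In Python 3, \w is Unicode-aware by default, so non-Latin hashtags
# (e.g. in Urdu or Arabic script) are captured as well as ASCII ones.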
# Extract captions from a DOCX file: a paragraph matching "Post N" starts a new
# post, and the paragraphs that follow are collected as that post's caption.
def extract_captions_from_docx(docx_file):
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post and text:  # skip empty paragraphs between posts
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}
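# Expected document shape (illustrative; the caption text is made up):
#
#     Post 1
#     First paragraph of the caption...
#     More caption text #JusticeNow
#     Post 2
#     ...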
# Generate a DOCX file in memory (returned as a BytesIO buffer)
def generate_docx(output_data):
    doc = Document()
    doc.add_heading('Activism Message Analysis', 0)
    for index, (caption, result) in enumerate(output_data.items(), start=1):
        doc.add_heading(f"{index}. {caption}", level=1)
        doc.add_paragraph("Full Caption:")
        doc.add_paragraph(result['Full Caption'], style="Quote")
        doc.add_paragraph(f"Language: {result['Language']}")
        doc.add_paragraph(f"Tone of Caption: {', '.join(result['Tone of Caption'])}")
        doc.add_paragraph(f"Number of Hashtags: {result['Hashtag Count']}")
        doc.add_paragraph(f"Hashtags Found: {', '.join(result['Hashtags'])}")
        doc.add_heading('Frames:', level=2)
        for frame in result['Frames']:
            doc.add_paragraph(frame)
    doc_io = io.BytesIO()
    doc.save(doc_io)
    doc_io.seek(0)
    return doc_io
# Streamlit app
st.title('AI-Powered Activism Message Analyzer')
st.write("Enter the text to analyze or upload a DOCX file containing captions:")
# Text Input
input_text = st.text_area("Input Text", height=200)
# File Upload
uploaded_file = st.file_uploader("Upload a DOCX file", type=["docx"])
# Initialize output dictionary
output_data = {}
if input_text:
    language = detect_language(input_text)
    tone = analyze_tone(input_text)
    hashtags = extract_hashtags(input_text)
    frames = extract_frames(input_text)
    output_data["Manual Input"] = {
        'Full Caption': input_text,
        'Language': language,
        'Tone of Caption': tone,
        'Hashtags': hashtags,
        'Hashtag Count': len(hashtags),
        'Frames': frames
    }
    st.success("Analysis completed for text input.")
if uploaded_file:
    captions = extract_captions_from_docx(uploaded_file)
    for caption, text in captions.items():
        language = detect_language(text)
        tone = analyze_tone(text)
        hashtags = extract_hashtags(text)
        frames = extract_frames(text)
        output_data[caption] = {
            'Full Caption': text,
            'Language': language,
            'Tone of Caption': tone,
            'Hashtags': hashtags,
            'Hashtag Count': len(hashtags),
            'Frames': frames
        }
    st.success(f"Analysis completed for {len(captions)} posts from the DOCX file.")
# Display results
if output_data:
    with st.expander("Generated Output"):
        st.subheader("Analysis Results")
        for index, (caption, result) in enumerate(output_data.items(), start=1):
            st.write(f"### {index}. {caption}")
            st.write("**Full Caption:**")
            st.write(f"> {result['Full Caption']}")
            st.write(f"**Language**: {result['Language']}")
            st.write(f"**Tone of Caption**: {', '.join(result['Tone of Caption'])}")
            st.write(f"**Number of Hashtags**: {result['Hashtag Count']}")
            st.write(f"**Hashtags Found:** {', '.join(result['Hashtags'])}")
            st.write("**Frames**:")
            for frame in result['Frames']:
                st.write(f"- {frame}")

    # generate_docx always returns a populated BytesIO, so no truthiness check is needed
    docx_file = generate_docx(output_data)
    st.download_button(
        label="Download Analysis as DOCX",
        data=docx_file,
        file_name="activism_message_analysis.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )