ahm14 committed on
Commit
7ddff49
·
verified ·
1 Parent(s): 706fc89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -19
app.py CHANGED
@@ -3,11 +3,25 @@ import re
3
  from langdetect import detect
4
  from transformers import pipeline
5
  import nltk
 
 
6
  from docx import Document
7
  import io
8
 
9
  # Download required NLTK resources
10
  nltk.download('punkt')
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # Updated tone categories
13
  tone_categories = {
@@ -25,7 +39,7 @@ tone_categories = {
25
  "Hopeful": ["progress", "unity", "hope", "victory", "together", "solidarity"]
26
  }
27
 
28
- # Updated frame categories
29
  frame_categories = {
30
  "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
31
  "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -44,47 +58,50 @@ frame_categories = {
44
  "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
45
  }
46
 
47
- # Detect language
48
  def detect_language(text):
49
  try:
50
  return detect(text)
51
- except Exception as e:
52
- st.write(f"Error detecting language: {e}")
53
  return "unknown"
54
 
 
 
 
 
 
 
55
  # Analyze tone based on predefined categories
56
  def analyze_tone(text):
57
  detected_tones = set()
58
  for category, keywords in tone_categories.items():
59
- if any(word in text.lower() for word in keywords):
60
  detected_tones.add(category)
61
 
62
  if not detected_tones:
63
- tone_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
64
  model_result = tone_model(text, candidate_labels=list(tone_categories.keys()))
65
  detected_tones.update(model_result["labels"][:2])
66
 
67
  return list(detected_tones)
68
 
69
- # Extract hashtags
70
- def extract_hashtags(text):
71
- return re.findall(r"#\w+", text)
72
-
73
- # Extract frames based on predefined categories
74
  def extract_frames(text):
75
  detected_frames = set()
76
  for category, keywords in frame_categories.items():
77
- if any(word in text.lower() for word in keywords):
78
  detected_frames.add(category)
79
 
80
  if not detected_frames:
81
- frame_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
82
  model_result = frame_model(text, candidate_labels=list(frame_categories.keys()))
83
- detected_frames.update(model_result["labels"][:2])
 
 
84
 
85
- return list(detected_frames)
 
 
86
 
87
- # Extract captions from DOCX file based on "Post X"
88
  def extract_captions_from_docx(docx_file):
89
  doc = Document(docx_file)
90
  captions = {}
@@ -99,7 +116,7 @@ def extract_captions_from_docx(docx_file):
99
 
100
  return {post: " ".join(lines) for post, lines in captions.items() if lines}
101
 
102
- # Generate a DOCX file in-memory with full captions
103
  def generate_docx(output_data):
104
  doc = Document()
105
  doc.add_heading('Activism Message Analysis', 0)
@@ -125,7 +142,7 @@ def generate_docx(output_data):
125
  return doc_io
126
 
127
  # Streamlit app
128
- st.title('AI-Powered Activism Message Analyzer with Intersectionality')
129
 
130
  st.write("Enter the text to analyze or upload a DOCX file containing captions:")
131
 
@@ -198,4 +215,4 @@ if output_data:
198
  data=docx_file,
199
  file_name="activism_message_analysis.docx",
200
  mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
201
- )
 
3
  from langdetect import detect
4
  from transformers import pipeline
5
  import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.stem import WordNetLemmatizer
8
  from docx import Document
9
  import io
10
 
11
  # Download required NLTK resources
12
  nltk.download('punkt')
13
+ nltk.download('wordnet')
14
+
15
+ # Initialize Lemmatizer
16
+ lemmatizer = WordNetLemmatizer()
17
+
18
# Cache the zero-shot model so it is loaded once per session, not per call.
@st.cache_resource
def load_pipeline():
    """Load and cache the zero-shot classification pipeline (BART-MNLI)."""
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Both classifiers reuse the single cached pipeline instance;
# the second call returns the cached object, not a second model.
tone_model = load_pipeline()
frame_model = load_pipeline()
25
 
26
  # Updated tone categories
27
  tone_categories = {
 
39
  "Hopeful": ["progress", "unity", "hope", "victory", "together", "solidarity"]
40
  }
41
 
42
+ # Updated frame categories (Limited to 4 selections)
43
  frame_categories = {
44
  "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
45
  "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
 
58
  "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
59
  }
60
 
61
# Language detection
def detect_language(text):
    """Return the detected language code of *text*, or "unknown" on failure."""
    try:
        detected = detect(text)
    except Exception:
        # langdetect raises on empty/ambiguous input; degrade gracefully
        # instead of crashing the Streamlit app.
        return "unknown"
    return detected
67
 
68
# NLP-based keyword matching with lemmatization
def contains_keywords(text, keywords):
    """Return True if any keyword appears in *text* after lowercasing
    and lemmatization.

    Fixes a defect in the original: multi-word keywords such as
    "human rights" or "honor killing" could never equal a single
    lemmatized token, so those entries were silently unmatchable.
    Single-word keywords are checked against the set of lemmatized
    tokens (O(1) membership); multi-word keywords are matched as a
    phrase against the lemmatized text.
    """
    tokens = word_tokenize(text.lower())
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]
    lemma_set = set(lemmas)           # fast membership for single words
    lemma_text = " ".join(lemmas)     # phrase search for multi-word keywords
    return any(
        (kw in lemma_text) if " " in kw else (kw in lemma_set)
        for kw in keywords
    )
73
+
74
# Analyze tone based on predefined categories
def analyze_tone(text):
    """Return the list of tone categories detected in *text*.

    Keyword matching runs first; if no category matches, the zero-shot
    classifier supplies its top two labels as a fallback.
    """
    matched = {
        category
        for category, keywords in tone_categories.items()
        if contains_keywords(text, keywords)
    }
    if not matched:
        prediction = tone_model(text, candidate_labels=list(tone_categories))
        matched.update(prediction["labels"][:2])
    return list(matched)
86
 
87
# Extract frames based on predefined categories (limit to 4)
def extract_frames(text):
    """Return up to four frame categories detected in *text*.

    Fixes a defect in the original: ``list(detected_frames)[:4]``
    truncated an unordered set, so which four frames survived was
    nondeterministic across runs. Keyword matches are now collected in
    the stable declaration order of ``frame_categories``; the zero-shot
    fallback keeps the model's top four labels in confidence-rank order.
    """
    detected = [
        category
        for category, keywords in frame_categories.items()
        if contains_keywords(text, keywords)
    ]
    if not detected:
        prediction = frame_model(text, candidate_labels=list(frame_categories))
        detected = list(prediction["labels"][:4])
    # Cap at four frames; selection is now deterministic.
    return detected[:4]
99
 
100
# Extract hashtags
def extract_hashtags(text):
    """Return all #hashtag tokens found in *text*, in order of appearance."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)
103
 
104
+ # Extract captions from DOCX file
105
  def extract_captions_from_docx(docx_file):
106
  doc = Document(docx_file)
107
  captions = {}
 
116
 
117
  return {post: " ".join(lines) for post, lines in captions.items() if lines}
118
 
119
+ # Generate a DOCX file in-memory
120
  def generate_docx(output_data):
121
  doc = Document()
122
  doc.add_heading('Activism Message Analysis', 0)
 
142
  return doc_io
143
 
144
  # Streamlit app
145
+ st.title('AI-Powered Activism Message Analyzer')
146
 
147
  st.write("Enter the text to analyze or upload a DOCX file containing captions:")
148
 
 
215
  data=docx_file,
216
  file_name="activism_message_analysis.docx",
217
  mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
218
+ )