Update app.py
app.py CHANGED
@@ -3,11 +3,25 @@ import re
 from langdetect import detect
 from transformers import pipeline
 import nltk
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
 from docx import Document
 import io
 
 # Download required NLTK resources
 nltk.download('punkt')
+nltk.download('wordnet')
+
+# Initialize Lemmatizer
+lemmatizer = WordNetLemmatizer()
+
+# Cache model to avoid reloading on every function call
+@st.cache_resource
+def load_pipeline():
+    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+
+tone_model = load_pipeline()
+frame_model = load_pipeline()
 
 # Updated tone categories
 tone_categories = {
@@ -25,7 +39,7 @@ tone_categories = {
     "Hopeful": ["progress", "unity", "hope", "victory", "together", "solidarity"]
 }
 
-# Updated frame categories
+# Updated frame categories (Limited to 4 selections)
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -44,47 +58,50 @@ frame_categories = {
     "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 }
 
-#
+# Language detection
 def detect_language(text):
     try:
         return detect(text)
-    except Exception as e:
-        st.write(f"Error detecting language: {e}")
+    except Exception:
         return "unknown"
 
+# NLP-based keyword matching with lemmatization
+def contains_keywords(text, keywords):
+    words = word_tokenize(text.lower())
+    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
+    return any(keyword in lemmatized_words for keyword in keywords)
+
 # Analyze tone based on predefined categories
 def analyze_tone(text):
     detected_tones = set()
     for category, keywords in tone_categories.items():
-        if any(keyword in text.lower() for keyword in keywords):
+        if contains_keywords(text, keywords):
             detected_tones.add(category)
 
     if not detected_tones:
-        tone_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
         model_result = tone_model(text, candidate_labels=list(tone_categories.keys()))
         detected_tones.update(model_result["labels"][:2])
 
     return list(detected_tones)
 
-# Extract hashtags
-def extract_hashtags(text):
-    return re.findall(r"#\w+", text)
-
-# Extract frames based on predefined categories
+# Extract frames based on predefined categories (Limit to 4)
 def extract_frames(text):
     detected_frames = set()
     for category, keywords in frame_categories.items():
-        if any(keyword in text.lower() for keyword in keywords):
+        if contains_keywords(text, keywords):
             detected_frames.add(category)
 
     if not detected_frames:
-        frame_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
         model_result = frame_model(text, candidate_labels=list(frame_categories.keys()))
-        detected_frames.update(model_result["labels"][:2])
+        detected_frames.update(model_result["labels"][:4])
 
-    return list(detected_frames)
+    return list(detected_frames)[:4]  # Ensure no more than 4 frames are selected
 
+# Extract hashtags
+def extract_hashtags(text):
+    return re.findall(r"#\w+", text)
+
-# Extract captions from DOCX file
+# Extract captions from DOCX file
 def extract_captions_from_docx(docx_file):
     doc = Document(docx_file)
     captions = {}
@@ -99,7 +116,7 @@ def extract_captions_from_docx(docx_file):
 
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Generate a DOCX file in-memory
+# Generate a DOCX file in-memory
 def generate_docx(output_data):
     doc = Document()
     doc.add_heading('Activism Message Analysis', 0)
@@ -125,7 +142,7 @@ def generate_docx(output_data):
     return doc_io
 
 # Streamlit app
-st.title('AI-Powered Activism Message Analyzer')
+st.title('AI-Powered Activism Message Analyzer')
 
 st.write("Enter the text to analyze or upload a DOCX file containing captions:")
 
@@ -198,4 +215,4 @@ if output_data:
         data=docx_file,
         file_name="activism_message_analysis.docx",
         mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    )
+    )
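
For reference, a minimal sketch of how the new lemmatized keyword matching behaves (it reuses the same `contains_keywords` helper introduced in the diff, with NLTK's `punkt` and `wordnet` data downloaded as the diff does; the sample sentences are hypothetical):

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Same helper as in the diff: tokenize, lemmatize each token,
# then look for an exact single-token match against the keyword list.
def contains_keywords(text, keywords):
    words = word_tokenize(text.lower())
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return any(keyword in lemmatized_words for keyword in keywords)

print(contains_keywords("New laws were passed", ["law"]))          # True: "laws" lemmatizes to "law"
print(contains_keywords("We demand justice", ["justice"]))         # True: exact token match
print(contains_keywords("human rights matter", ["human rights"]))  # False: a two-word keyword never equals a single lemmatized token

One consequence worth noting: multi-word entries in the category lists such as "human rights" or "honor killing" are never matched by this token-level check, so those categories can only be picked up by the zero-shot fallback.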