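"""News analysis script: fetch articles from NewsAPI, then run sentiment
analysis, topic classification, and summarization with Hugging Face models,
plus simple trending-word extraction with NLTK."""
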
import requests
from collections import Counter
from transformers import pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import os

# Keep NLTK data and Hugging Face caches under /tmp (useful when the default
# home directory is not writable, e.g. in containerized or serverless setups).
nltk_data_dir = "/tmp/nltk_data"
hf_cache_dir = "/tmp/huggingface"

os.makedirs(nltk_data_dir, exist_ok=True)
os.makedirs(hf_cache_dir, exist_ok=True)

os.environ["NLTK_DATA"] = nltk_data_dir
os.environ["HF_HOME"] = hf_cache_dir

# Download the required NLTK resources into the local data directory.
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)


# 1. Fetching news via NewsAPI
def get_news(query, api_key, num_articles=5):
    """Fetch up to `num_articles` English articles matching `query` from NewsAPI."""
    url = 'https://newsapi.org/v2/everything'
    # Pass the query as params so requests handles URL encoding.
    params = {'q': query, 'apiKey': api_key, 'language': 'en', 'pageSize': num_articles}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json().get('articles', [])
    return []


# 2. Sentiment analysis with Hugging Face
tone_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", revision="714eb0f")

def analyze_sentiment(text):
    # Returns a dict like {'label': 'POSITIVE', 'score': 0.99}.
    return tone_analyzer(text)[0]


# 3. Topic classification
# cardiffnlp/tweet-topic-21-multi is a multi-label topic model; the single
# highest-scoring topic is used here as the article category.
category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")
# Take the label names from the model config so indices always match the
# model's output dimension (a hard-coded list can drift out of sync).
labels = category_model.config.id2label

def classify_category(text):
    inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = category_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return labels[predicted_class]


# 4. Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_words=512):
    """Split text into chunks of at most `max_words` whitespace-separated words
    (a rough proxy for the summarizer's token limit)."""
    words = text.split()
    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

def summarize_text(text):
    chunks = split_text(text)
    summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                 for chunk in chunks]
    return ' '.join(summaries)


# 5. Extracting trending words
def extract_trending_words(texts):
    """Return the 10 most frequent meaningful words across the given texts."""
    text = ' '.join(texts).lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))  # build the set once, not per word
    words = [w for w in words if w not in stop_words and w not in string.punctuation and len(w) > 1]
    return Counter(words).most_common(10)

# 6. The main news-analysis pipeline
def analyze_news(query, api_key, num_articles=5):
    articles = get_news(query, api_key, num_articles)
    if not articles:
        return []

    news_results = []
    for article in articles:
        # NewsAPI can return explicit nulls, so fall back with `or` rather than
        # relying on dict defaults alone.
        title = article.get('title') or 'No Title'
        description = article.get('description') or ''
        url = article.get('url') or '#'
        text = f"{title} {description}".strip()

        sentiment = analyze_sentiment(text)['label']
        category = classify_category(text)
        summary = summarize_text(text)

        news_results.append({
            "title": title,
            "url": url,
            "sentiment": sentiment,
            "category": category,
            "summary": summary
        })

    return news_results
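

# Example usage -- a minimal sketch, not part of the processing code above.
# It assumes a NewsAPI key in the NEWS_API_KEY environment variable; that
# variable name and the query string are illustrative only.
if __name__ == "__main__":
    api_key = os.environ.get("NEWS_API_KEY", "")
    results = analyze_news("artificial intelligence", api_key, num_articles=3)

    for item in results:
        print(f"[{item['sentiment']}] {item['category']}: {item['title']}")
        print(f"  {item['summary']}")
        print(f"  {item['url']}\n")

    # extract_trending_words() is not called by analyze_news(); applying it to
    # the generated summaries is one possible use.
    if results:
        print("Trending words:", extract_trending_words([r['summary'] for r in results]))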