Sentiment analysis using pretrained models
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .dockerignore +12 -0
- .gitattributes +2 -0
- .slugignore +8 -0
- Dockerfile +68 -0
- Procfile +1 -0
- app.py +263 -0
- instance/sentiment_data.db +0 -0
- nltk_data/corpora/wordnet.zip +3 -0
- nltk_data/taggers/averaged_perceptron_tagger.zip +3 -0
- nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
- nltk_data/tokenizers/punkt.zip +3 -0
- nltk_data/tokenizers/punkt/.DS_Store +0 -0
- nltk_data/tokenizers/punkt/PY3/README +98 -0
- nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
- nltk_data/tokenizers/punkt/README +98 -0
- nltk_data/tokenizers/punkt/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/english.pickle +3 -0
- nltk_data/tokenizers/punkt/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/french.pickle +3 -0
- nltk_data/tokenizers/punkt/german.pickle +3 -0
- nltk_data/tokenizers/punkt/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/spanish.pickle +3 -0
.dockerignore
ADDED
@@ -0,0 +1,12 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.sqlite3
+instance/
+.git
+.gitignore
+.env
+*.log
+before.zip
+wordcloud.png
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+static/nlpa.png filter=lfs diff=lfs merge=lfs -text
+static/wordcloud.png filter=lfs diff=lfs merge=lfs -text
.slugignore
ADDED
@@ -0,0 +1,8 @@
+.git
+__pycache__
+*.pyc
+node_modules/
+tests/
+*.log
+nltk_data/
+
Dockerfile
ADDED
@@ -0,0 +1,68 @@
+# syntax=docker/dockerfile:1
+
+FROM python:3.10-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PORT=7860 \
+    HF_HOME=/app/.cache/huggingface \
+    NLTK_DATA=/app/nltk_data \
+    MPLCONFIGDIR=/app/.config/matplotlib
+
+# System deps (build tools and libs for pillow/wordcloud)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    g++ \
+    libjpeg-dev \
+    zlib1g-dev \
+    libpng-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python deps first (better layer caching)
+COPY requirements.txt ./
+RUN pip install --upgrade pip && \
+    pip install -r requirements.txt
+
+# Prepare writable caches
+RUN mkdir -p ${HF_HOME} ${NLTK_DATA} ${MPLCONFIGDIR}
+
+# Copy application code
+COPY . .
+
+# Ensure writable permissions for runtime (Spaces/K8s non-root scenarios)
+RUN chmod -R 777 /app
+
+# Run postbuild (e.g., install spaCy model) if present
+RUN if [ -f postbuild ]; then sh postbuild; else python -m spacy download en_core_web_md; fi
+
+# Pre-download NLTK data to writable dir
+RUN python - <<'PY'
+import nltk, os
+os.makedirs(os.environ.get('NLTK_DATA','/app/nltk_data'), exist_ok=True)
+for pkg in ['punkt','punkt_tab','wordnet','averaged_perceptron_tagger']:
+    try:
+        nltk.download(pkg, download_dir=os.environ['NLTK_DATA'])
+    except Exception as e:
+        print('NLTK download failed for', pkg, e)
+PY
+
+# Preload HF transformer models to writable cache
+RUN python - <<'PY'
+from transformers import pipeline
+# DistilBERT SST-2
+pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
+# RoBERTa Twitter
+pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment')
+# Emotion model
+pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')
+PY
+
+# Expose default port (can be overridden by $PORT)
+EXPOSE 7860
+
+# Start the app using gunicorn (respects $PORT)
+CMD ["sh", "-c", "gunicorn -b 0.0.0.0:${PORT:-7860} app:app"]
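Note: the two RUN python heredocs above bake the NLTK resources and the three Hugging Face models into the image so nothing has to be downloaded at request time. A minimal smoke-test sketch for that assumption follows; the file name smoke_test.py and the offline environment variables are illustrative additions, not part of this commit:

# smoke_test.py -- hypothetical check that the baked-in caches are complete.
# Assumes HF_HOME and NLTK_DATA are set as in the Dockerfile above.
import os

# Fail loudly on a cache miss instead of silently re-downloading.
os.environ.setdefault('HF_HUB_OFFLINE', '1')
os.environ.setdefault('TRANSFORMERS_OFFLINE', '1')

import nltk
from transformers import pipeline

nltk.data.path.insert(0, os.environ.get('NLTK_DATA', '/app/nltk_data'))
nltk.data.find('tokenizers/punkt')   # raises LookupError if the resource is missing
nltk.data.find('corpora/wordnet')

for model_id in [
    'distilbert-base-uncased-finetuned-sst-2-english',
    'cardiffnlp/twitter-roberta-base-sentiment',
    'j-hartmann/emotion-english-distilroberta-base',
]:
    clf = pipeline('sentiment-analysis', model=model_id)
    print(model_id, clf('The caches are working.')[0])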
Procfile
ADDED
@@ -0,0 +1 @@
+web: gunicorn app:app
app.py
ADDED
@@ -0,0 +1,263 @@
+from flask import Flask, request, render_template, make_response
+from flask_sqlalchemy import SQLAlchemy
+from sentiment_model import preprocess_text, analyze_sentiment, read_file
+from wordcloud import WordCloud
+import os
+import nltk
+
+# Ensure NLTK uses a writable directory inside the container
+NLTK_DIR = os.environ.get('NLTK_DATA', os.path.join(os.getcwd(), 'nltk_data'))
+os.makedirs(NLTK_DIR, exist_ok=True)
+if NLTK_DIR not in nltk.data.path:
+    nltk.data.path.insert(0, NLTK_DIR)
+
+# Download required NLTK resources to the writable dir (no-op if present)
+for pkg in ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger']:
+    try:
+        nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
+    except Exception:
+        pass
+
+app = Flask(__name__, static_folder='static')
+app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///sentiment_data.db'
+app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+db = SQLAlchemy(app)
+
+# Define SentimentRecord model
+class SentimentRecord(db.Model):
+    id = db.Column(db.Integer, primary_key=True)
+    original_text = db.Column(db.Text, nullable=False)
+    cleaned_text = db.Column(db.Text, nullable=False)
+    removed_text = db.Column(db.Text, nullable=False)
+    normalized_text = db.Column(db.Text, nullable=False)
+    tokenized_text = db.Column(db.Text, nullable=False)
+    stemmed_text = db.Column(db.Text, nullable=False)
+    lemmatized_text = db.Column(db.Text, nullable=False)
+    sentiment = db.Column(db.String(20), nullable=False)
+    ner = db.Column(db.Text, nullable=False)
+    pos = db.Column(db.Text, nullable=False)
+
+with app.app_context():
+    db.create_all()
+
+# Global variables to store the analysis result
+analysis_result = {}
+
+@app.route('/')
+def home():
+    return render_template('index.html',
+                           sentiment=None,
+                           text=None,
+                           file_uploaded=None,
+                           model_type='default')
+
+@app.route('/analyze', methods=['POST'])
+def analyze():
+    global analysis_result # To store the results globally for the download
+    text = request.form.get('text', '').strip()
+    file = request.files.get('file')
+    model_type = request.form.get('model_type', 'default')
+
+    file_uploaded = False
+    if file and file.filename != '':
+        text = read_file(file)
+        file_uploaded = True
+
+    if not text or len(text.split()) < 4:
+        return render_template('index.html',
+                               error='Please provide at least 4 words for analysis.',
+                               text=text,
+                               model_type=model_type,
+                               file_uploaded=file_uploaded)
+
+    word_count = len(text.split())
+    if word_count > 300:
+        return render_template('index.html',
+                               error='Input text exceeds the 300-word limit.',
+                               text=text,
+                               model_type=model_type,
+                               file_uploaded=file_uploaded)
+
+    try:
+        # Step 1: Preprocess text (cleaning, normalization, etc.)
+        cleaned_text, removed_text, normalized_text, tokenized_text, stemmed_text, lemmatized_text, ner, pos = preprocess_text(text)
+
+        # Step 2: Use lemmatized text for sentiment analysis
+        lemmatized_text_joined = " ".join(lemmatized_text)
+        sentiment, probabilities = analyze_sentiment(lemmatized_text_joined, model_type=model_type)
+
+        # Word-level sentiment analysis
+        neutral_words, positive_words, negative_words = [], [], []
+
+        if model_type != 'emotion':
+            for word in lemmatized_text:
+                word_sentiment, _ = analyze_sentiment(word, model_type=model_type)
+                if word_sentiment == 'POSITIVE':
+                    positive_words.append(word)
+                elif word_sentiment == 'NEGATIVE':
+                    negative_words.append(word)
+                elif word_sentiment == 'NEUTRAL':
+                    neutral_words.append(word)
+
+            word_sentiment_distribution = {
+                'positive': len(positive_words),
+                'neutral': len(neutral_words),
+                'negative': len(negative_words)
+            }
+        else:
+            # Emotion model word-level sentiment analysis
+            emotion_counters = {
+                'ANGER': 0, 'DISGUST': 0, 'FEAR': 0, 'JOY': 0, 'NEUTRAL': 0, 'SADNESS': 0, 'SURPRISE': 0
+            }
+            emotion_words = {
+                'ANGER': [], 'DISGUST': [], 'FEAR': [], 'JOY': [], 'NEUTRAL': [], 'SADNESS': [], 'SURPRISE': []
+            }
+            for word in lemmatized_text:
+                word_sentiment, _ = analyze_sentiment(word, model_type=model_type)
+                if word_sentiment in emotion_counters:
+                    emotion_counters[word_sentiment] += 1
+                    emotion_words[word_sentiment].append(word)
+
+            word_sentiment_distribution = {
+                'anger': emotion_counters['ANGER'],
+                'disgust': emotion_counters['DISGUST'],
+                'fear': emotion_counters['FEAR'],
+                'joy': emotion_counters['JOY'],
+                'neutral': emotion_counters['NEUTRAL'],
+                'sadness': emotion_counters['SADNESS'],
+                'surprise': emotion_counters['SURPRISE']
+            }
+
+        # Store the analysis result in global variable for download
+        analysis_result = {
+            'sentiment': sentiment,
+            'model_type': model_type,
+            'cleaned_text': cleaned_text,
+            'removed_text': removed_text,
+            'normalized_text': normalized_text,
+            'tokenized_text': tokenized_text,
+            'stemmed_text': stemmed_text,
+            'lemmatized_text': lemmatized_text,
+            'ner': ner,
+            'pos': pos,
+            'original_text': text,
+            'word_sentiment_distribution': word_sentiment_distribution,
+            'positive_words': positive_words,
+            'negative_words': negative_words,
+            'neutral_words': neutral_words if model_type != 'emotion' else [],
+            'emotion_words': emotion_words if model_type == 'emotion' else None
+        }
+
+        # Generate Word Cloud
+        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_joined)
+        wordcloud_path = os.path.join('static', 'wordcloud.png')
+        wordcloud.to_file(wordcloud_path)
+
+        return render_template('index.html',
+                               sentiment=sentiment,
+                               cleaned_text=cleaned_text,
+                               removed_text=removed_text,
+                               normalized_text=normalized_text,
+                               tokenized_text=tokenized_text,
+                               stemmed_text=" ".join(stemmed_text),
+                               lemmatized_text=" ".join(lemmatized_text),
+                               ner=ner,
+                               pos=pos,
+                               probabilities=probabilities,
+                               wordcloud_url=wordcloud_path,
+                               word_sentiment_distribution=word_sentiment_distribution,
+                               positive_words=positive_words,
+                               negative_words=negative_words,
+                               neutral_words=neutral_words if model_type != 'emotion' else [],
+                               emotion_words=emotion_words if model_type == 'emotion' else None,
+                               text=text,
+                               model_type=model_type,
+                               total_words=len(tokenized_text),
+                               file_uploaded=file_uploaded)
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return render_template('index.html',
+                               error='An error occurred during analysis.',
+                               text=text,
+                               model_type=model_type,
+                               file_uploaded=file_uploaded)
+
+@app.route('/download')
+def download_result():
+    global analysis_result
+    try:
+        if not analysis_result:
+            return "No analysis available for download", 400
+
+        # Build content for the TXT file
+        content = f"""
+Sentiment
+Overall Sentiment: {analysis_result['sentiment']}
+
+Model Used
+Selected Model: {analysis_result['model_type']}
+
+Original Text:
+{analysis_result['original_text']}
+
+Text Preprocessing Results
+Cleaned Text:
+{analysis_result['cleaned_text']}
+
+Removed Text:
+{analysis_result['removed_text']}
+
+Normalized Text:
+{analysis_result['normalized_text']}
+
+Tokenized Text:
+{', '.join(analysis_result['tokenized_text'])}
+
+Stemmed Text:
+{" ".join(analysis_result['stemmed_text'])}
+
+Lemmatized Text:
+{" ".join(analysis_result['lemmatized_text'])}
+
+Named Entities (NER):
+{', '.join([f"{entity[0]} ({entity[1]})" for entity in analysis_result['ner']])}
+
+POS Tags:
+{', '.join([f"{word} ({tag})" for word, tag in analysis_result['pos']])}
+
+Total Words: {len(analysis_result['tokenized_text'])}
+
+"""
+        # If the model is 'emotion', include emotion-based words
+        if analysis_result['model_type'] == 'emotion':
+            content += "\nEmotion-Specific Words:\n"
+            for emotion, words in analysis_result['emotion_words'].items():
+                content += f"{emotion.capitalize()} Words: {len(words)}\n"
+                content += f"{', '.join(words)}\n"
+
+        # Otherwise, include positive, neutral, and negative words for other models
+        else:
+            content += f"""
+Positive Words: {len(analysis_result['positive_words'])}
+{', '.join(analysis_result['positive_words'])}
+
+Neutral Words: {len(analysis_result['neutral_words'])}
+{', '.join(analysis_result['neutral_words'])}
+
+Negative Words: {len(analysis_result['negative_words'])}
+{', '.join(analysis_result['negative_words'])}
+"""
+
+        # Create a response object with the content
+        response = make_response(content)
+        response.headers["Content-Disposition"] = "attachment; filename=sentiment_analysis_result.txt"
+        response.headers["Content-Type"] = "text/plain"
+        return response
+    except Exception as e:
+        print(f"Error during file download: {e}")
+        return "Error in generating file", 500
+
+if __name__ == '__main__':
+    port = int(os.environ.get('PORT', 7860))
+    app.run(host='0.0.0.0', port=port)
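Note: app.py imports preprocess_text, analyze_sentiment, and read_file from a sentiment_model module that falls outside this 50-file view. A minimal sketch of what analyze_sentiment could look like, assuming it wraps the three pipelines preloaded in the Dockerfile; the model_type keys, caching scheme, and label handling below are assumptions, not the repository's actual implementation:

# sentiment_model.py -- hypothetical sketch, not one of the files shown in this diff.
# Assumes the model IDs preloaded in the Dockerfile; the real module also implements
# preprocess_text() and read_file(), and presumably maps raw labels such as
# LABEL_0/1/2 onto POSITIVE/NEUTRAL/NEGATIVE, which this sketch skips.
from transformers import pipeline

_MODEL_IDS = {
    'default': 'distilbert-base-uncased-finetuned-sst-2-english',
    'roberta': 'cardiffnlp/twitter-roberta-base-sentiment',
    'emotion': 'j-hartmann/emotion-english-distilroberta-base',
}
_PIPELINES = {}


def _get_pipeline(model_type):
    # Build each pipeline once and reuse it across requests.
    if model_type not in _PIPELINES:
        model_id = _MODEL_IDS.get(model_type, _MODEL_IDS['default'])
        _PIPELINES[model_type] = pipeline('sentiment-analysis', model=model_id,
                                          return_all_scores=True)
    return _PIPELINES[model_type]


def analyze_sentiment(text, model_type='default'):
    # Returns (label, {label: probability, ...}) in the shape app.py expects.
    outputs = _get_pipeline(model_type)(text)
    scores = outputs[0] if isinstance(outputs[0], list) else outputs
    probabilities = {s['label'].upper(): s['score'] for s in scores}
    label = max(probabilities, key=probabilities.get)
    return label, probabilities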
instance/sentiment_data.db
ADDED
Binary file (8.19 kB)
nltk_data/corpora/wordnet.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59
+size 10775600
nltk_data/taggers/averaged_perceptron_tagger.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1f13cf2532daadfd6f3bc481a49859f0b8ea6432ccdcd83e6a49a5f19008de9
+size 2526731
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
+size 6138625
nltk_data/tokenizers/punkt.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
+size 13905355
nltk_data/tokenizers/punkt/.DS_Store
ADDED
Binary file (6.15 kB)
nltk_data/tokenizers/punkt/PY3/README
ADDED
@@ -0,0 +1,98 @@
+Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+been contributed by various people using NLTK for sentence boundary detection.
+
+For information about how to use these models, please confer the tokenization HOWTO:
+http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+and chapter 3.8 of the NLTK book:
+http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+There are pretrained tokenizers for the following languages:
+
+File                Language    Source                              Contents                        Size of training corpus(in tokens)   Model contributed by
+=======================================================================================================================================================================
+czech.pickle        Czech       Multilingual Corpus 1 (ECI)         Lidove Noviny                   ~345,000                             Jan Strunk / Tibor Kiss
+                                                                    Literarni Noviny
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+danish.pickle       Danish      Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende              ~550,000                             Jan Strunk / Tibor Kiss
+                                (Berlingske Avisdata, Copenhagen)   Weekend Avisen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+dutch.pickle        Dutch       Multilingual Corpus 1 (ECI)         De Limburger                    ~340,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+english.pickle      English     Penn Treebank (LDC)                 Wall Street Journal             ~469,000                             Jan Strunk / Tibor Kiss
+                    (American)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+estonian.pickle     Estonian    University of Tartu, Estonia        Eesti Ekspress                  ~359,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+finnish.pickle      Finnish     Finnish Parole Corpus, Finnish      Books and major national        ~364,000                             Jan Strunk / Tibor Kiss
+                                Text Bank (Suomen Kielen            newspapers
+                                Tekstipankki)
+                                Finnish Center for IT Science
+                                (CSC)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+french.pickle       French      Multilingual Corpus 1 (ECI)         Le Monde                        ~370,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+german.pickle       German      Neue Zürcher Zeitung AG             Neue Zürcher Zeitung            ~847,000                             Jan Strunk / Tibor Kiss
+                                (Switzerland)                       CD-ROM
+                                                                    (Uses "ss"
+                                                                    instead of "ß")
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+greek.pickle        Greek       Efstathios Stamatatos               To Vima (TO BHMA)               ~227,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+italian.pickle      Italian     Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino           ~312,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+norwegian.pickle    Norwegian   Centre for Humanities               Bergens Tidende                 ~479,000                             Jan Strunk / Tibor Kiss
+                    (Bokmål and Information Technologies,
+                    Nynorsk)    Bergen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+polish.pickle       Polish      Polish National Corpus              Literature, newspapers, etc.    ~1,000,000                           Krzysztof Langner
+                                (http://www.nkjp.pl/)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+portuguese.pickle   Portuguese  CETENFolha Corpus                   Folha de São Paulo              ~321,000                             Jan Strunk / Tibor Kiss
+                    (Brazilian) (Linguateca)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+slovene.pickle      Slovene     TRACTOR                             Delo                            ~354,000                             Jan Strunk / Tibor Kiss
+                                Slovene Academy for Arts
+                                and Sciences
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+spanish.pickle      Spanish     Multilingual Corpus 1 (ECI)         Sur                             ~353,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+swedish.pickle      Swedish     Multilingual Corpus 1 (ECI)         Dagens Nyheter                  ~339,000                             Jan Strunk / Tibor Kiss
+                                (and some other texts)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+turkish.pickle      Turkish     METU Turkish Corpus                 Milliyet                        ~333,000                             Jan Strunk / Tibor Kiss
+                                (Türkçe Derlem Projesi)
+                                University of Ankara
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+Unicode using the codecs module.
+
+Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+Computational Linguistics 32: 485-525.
+
+---- Training Code ----
+
+# import punkt
+import nltk.tokenize.punkt
+
+# Make a new Tokenizer
+tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+# Read in training corpus (one example: Slovene)
+import codecs
+text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+# Train tokenizer
+tokenizer.train(text)
+
+# Dump pickled tokenizer
+import pickle
+out = open("slovene.pickle","wb")
+pickle.dump(tokenizer, out)
+out.close()
+
+---------
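Note: the training snippet at the end of this bundled README dates from Python 2; codecs.open(..., "Ur", ...) relies on the universal-newlines "U" mode, which Python 3.11 removed. A modern equivalent of the same example (the slovene.plain and slovene.pickle file names come from the README; the rest is just current idiom, not part of this commit) would be:

# Modern rewrite of the README's Punkt training example.
import pickle

import nltk.tokenize.punkt

# Make a new tokenizer.
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in the training corpus (one example: Slovene).
with open("slovene.plain", encoding="iso-8859-2") as f:
    text = f.read()

# Train the tokenizer, then pickle it.
tokenizer.train(text)
with open("slovene.pickle", "wb") as out:
    pickle.dump(tokenizer, out)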
nltk_data/tokenizers/punkt/PY3/czech.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
+size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
+size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
+size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
+size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
+size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
+size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
+size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
+size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
+size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
+size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
+size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
+size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
+size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
+size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
+size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
+size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
+size 1017038
nltk_data/tokenizers/punkt/README
ADDED
@@ -0,0 +1,98 @@
+Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+been contributed by various people using NLTK for sentence boundary detection.
+
+For information about how to use these models, please confer the tokenization HOWTO:
+http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+and chapter 3.8 of the NLTK book:
+http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+There are pretrained tokenizers for the following languages:
+
+File                Language    Source                              Contents                        Size of training corpus(in tokens)   Model contributed by
+=======================================================================================================================================================================
+czech.pickle        Czech       Multilingual Corpus 1 (ECI)         Lidove Noviny                   ~345,000                             Jan Strunk / Tibor Kiss
+                                                                    Literarni Noviny
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+danish.pickle       Danish      Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende              ~550,000                             Jan Strunk / Tibor Kiss
+                                (Berlingske Avisdata, Copenhagen)   Weekend Avisen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+dutch.pickle        Dutch       Multilingual Corpus 1 (ECI)         De Limburger                    ~340,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+english.pickle      English     Penn Treebank (LDC)                 Wall Street Journal             ~469,000                             Jan Strunk / Tibor Kiss
+                    (American)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+estonian.pickle     Estonian    University of Tartu, Estonia        Eesti Ekspress                  ~359,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+finnish.pickle      Finnish     Finnish Parole Corpus, Finnish      Books and major national        ~364,000                             Jan Strunk / Tibor Kiss
+                                Text Bank (Suomen Kielen            newspapers
+                                Tekstipankki)
+                                Finnish Center for IT Science
+                                (CSC)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+french.pickle       French      Multilingual Corpus 1 (ECI)         Le Monde                        ~370,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+german.pickle       German      Neue Zürcher Zeitung AG             Neue Zürcher Zeitung            ~847,000                             Jan Strunk / Tibor Kiss
+                                (Switzerland)                       CD-ROM
+                                                                    (Uses "ss"
+                                                                    instead of "ß")
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+greek.pickle        Greek       Efstathios Stamatatos               To Vima (TO BHMA)               ~227,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+italian.pickle      Italian     Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino           ~312,000                             Jan Strunk / Tibor Kiss
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+norwegian.pickle    Norwegian   Centre for Humanities               Bergens Tidende                 ~479,000                             Jan Strunk / Tibor Kiss
+                    (Bokmål and Information Technologies,
+                    Nynorsk)    Bergen
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+polish.pickle       Polish      Polish National Corpus              Literature, newspapers, etc.    ~1,000,000                           Krzysztof Langner
+                                (http://www.nkjp.pl/)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+portuguese.pickle   Portuguese  CETENFolha Corpus                   Folha de São Paulo              ~321,000                             Jan Strunk / Tibor Kiss
+                    (Brazilian) (Linguateca)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+slovene.pickle      Slovene     TRACTOR                             Delo                            ~354,000                             Jan Strunk / Tibor Kiss
+                                Slovene Academy for Arts
+                                and Sciences
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+spanish.pickle      Spanish     Multilingual Corpus 1 (ECI)         Sur                             ~353,000                             Jan Strunk / Tibor Kiss
+                    (European)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+swedish.pickle      Swedish     Multilingual Corpus 1 (ECI)         Dagens Nyheter                  ~339,000                             Jan Strunk / Tibor Kiss
+                                (and some other texts)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+turkish.pickle      Turkish     METU Turkish Corpus                 Milliyet                        ~333,000                             Jan Strunk / Tibor Kiss
+                                (Türkçe Derlem Projesi)
+                                University of Ankara
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+Unicode using the codecs module.
+
+Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+Computational Linguistics 32: 485-525.
+
+---- Training Code ----
+
+# import punkt
+import nltk.tokenize.punkt
+
+# Make a new Tokenizer
+tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+# Read in training corpus (one example: Slovene)
+import codecs
+text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+# Train tokenizer
+tokenizer.train(text)
+
+# Dump pickled tokenizer
+import pickle
+out = open("slovene.pickle","wb")
+pickle.dump(tokenizer, out)
+out.close()
+
+---------
nltk_data/tokenizers/punkt/czech.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690
+size 1265552
nltk_data/tokenizers/punkt/danish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20
+size 1264725
nltk_data/tokenizers/punkt/dutch.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c
+size 742624
nltk_data/tokenizers/punkt/english.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
+size 433305
nltk_data/tokenizers/punkt/estonian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d
+size 1596714
nltk_data/tokenizers/punkt/finnish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835
+size 1951656
nltk_data/tokenizers/punkt/french.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee
+size 583482
nltk_data/tokenizers/punkt/german.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c
+size 1526714
nltk_data/tokenizers/punkt/greek.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c
+size 1953106
nltk_data/tokenizers/punkt/italian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805
+size 658331
nltk_data/tokenizers/punkt/malayalam.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+size 221207
nltk_data/tokenizers/punkt/norwegian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf
+size 1259779
nltk_data/tokenizers/punkt/polish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92
+size 2042451
nltk_data/tokenizers/punkt/portuguese.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0
+size 649051
nltk_data/tokenizers/punkt/russian.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
+size 33027
nltk_data/tokenizers/punkt/slovene.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea
+size 832867
nltk_data/tokenizers/punkt/spanish.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3
+size 597831