aradhyapavan committed
Commit 7cb1242 · verified · 1 Parent(s): 0cca74f

Sentiment analysis using pretrained models

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .dockerignore +12 -0
  2. .gitattributes +2 -0
  3. .slugignore +8 -0
  4. Dockerfile +68 -0
  5. Procfile +1 -0
  6. app.py +263 -0
  7. instance/sentiment_data.db +0 -0
  8. nltk_data/corpora/wordnet.zip +3 -0
  9. nltk_data/taggers/averaged_perceptron_tagger.zip +3 -0
  10. nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
  11. nltk_data/tokenizers/punkt.zip +3 -0
  12. nltk_data/tokenizers/punkt/.DS_Store +0 -0
  13. nltk_data/tokenizers/punkt/PY3/README +98 -0
  14. nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
  15. nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
  16. nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
  17. nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
  18. nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
  19. nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
  20. nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
  21. nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
  22. nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
  23. nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
  24. nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
  25. nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
  26. nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
  27. nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
  28. nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
  29. nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
  30. nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
  31. nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
  32. nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
  33. nltk_data/tokenizers/punkt/README +98 -0
  34. nltk_data/tokenizers/punkt/czech.pickle +3 -0
  35. nltk_data/tokenizers/punkt/danish.pickle +3 -0
  36. nltk_data/tokenizers/punkt/dutch.pickle +3 -0
  37. nltk_data/tokenizers/punkt/english.pickle +3 -0
  38. nltk_data/tokenizers/punkt/estonian.pickle +3 -0
  39. nltk_data/tokenizers/punkt/finnish.pickle +3 -0
  40. nltk_data/tokenizers/punkt/french.pickle +3 -0
  41. nltk_data/tokenizers/punkt/german.pickle +3 -0
  42. nltk_data/tokenizers/punkt/greek.pickle +3 -0
  43. nltk_data/tokenizers/punkt/italian.pickle +3 -0
  44. nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
  45. nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
  46. nltk_data/tokenizers/punkt/polish.pickle +3 -0
  47. nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
  48. nltk_data/tokenizers/punkt/russian.pickle +3 -0
  49. nltk_data/tokenizers/punkt/slovene.pickle +3 -0
  50. nltk_data/tokenizers/punkt/spanish.pickle +3 -0
.dockerignore ADDED
@@ -0,0 +1,12 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.sqlite3
+ instance/
+ .git
+ .gitignore
+ .env
+ *.log
+ before.zip
+ wordcloud.png
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ static/nlpa.png filter=lfs diff=lfs merge=lfs -text
+ static/wordcloud.png filter=lfs diff=lfs merge=lfs -text
.slugignore ADDED
@@ -0,0 +1,8 @@
+ .git
+ __pycache__
+ *.pyc
+ node_modules/
+ tests/
+ *.log
+ nltk_data/
+
Dockerfile ADDED
@@ -0,0 +1,68 @@
+ # syntax=docker/dockerfile:1
+
+ FROM python:3.10-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PORT=7860 \
+     HF_HOME=/app/.cache/huggingface \
+     NLTK_DATA=/app/nltk_data \
+     MPLCONFIGDIR=/app/.config/matplotlib
+
+ # System deps (build tools and libs for pillow/wordcloud)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     gcc \
+     g++ \
+     libjpeg-dev \
+     zlib1g-dev \
+     libpng-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Install Python deps first (better layer caching)
+ COPY requirements.txt ./
+ RUN pip install --upgrade pip && \
+     pip install -r requirements.txt
+
+ # Prepare writable caches
+ RUN mkdir -p ${HF_HOME} ${NLTK_DATA} ${MPLCONFIGDIR}
+
+ # Copy application code
+ COPY . .
+
+ # Ensure writable permissions for runtime (Spaces/K8s non-root scenarios)
+ RUN chmod -R 777 /app
+
+ # Run postbuild (e.g., install spaCy model) if present
+ RUN if [ -f postbuild ]; then sh postbuild; else python -m spacy download en_core_web_md; fi
+
+ # Pre-download NLTK data to writable dir
+ RUN python - <<'PY'
+ import nltk, os
+ os.makedirs(os.environ.get('NLTK_DATA','/app/nltk_data'), exist_ok=True)
+ for pkg in ['punkt','punkt_tab','wordnet','averaged_perceptron_tagger']:
+     try:
+         nltk.download(pkg, download_dir=os.environ['NLTK_DATA'])
+     except Exception as e:
+         print('NLTK download failed for', pkg, e)
+ PY
+
+ # Preload HF transformer models to writable cache
+ RUN python - <<'PY'
+ from transformers import pipeline
+ # DistilBERT SST-2
+ pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
+ # RoBERTa Twitter
+ pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment')
+ # Emotion model
+ pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')
+ PY
+
+ # Expose default port (can be overridden by $PORT)
+ EXPOSE 7860
+
+ # Start the app using gunicorn (respects $PORT)
+ CMD ["sh", "-c", "gunicorn -b 0.0.0.0:${PORT:-7860} app:app"]
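The two heredoc RUN steps above warm the NLTK and Hugging Face caches at build time, so the container does not have to download anything when it starts. Below is a minimal smoke-test sketch, not part of this commit, that can be run inside the built image (it assumes HF_HOME and NLTK_DATA still point at the pre-populated directories) to confirm the baked-in assets resolve without new downloads:

# Hypothetical check script (illustrative only, not in the repo);
# run it inside the built image, e.g. docker run <image> python check_cache.py
import nltk
from transformers import pipeline

# Raises LookupError if the punkt tokenizer was not baked into NLTK_DATA.
nltk.data.find('tokenizers/punkt')

SAMPLE = "The build cache seems to be working nicely."
for name in (
    'distilbert-base-uncased-finetuned-sst-2-english',
    'cardiffnlp/twitter-roberta-base-sentiment',
    'j-hartmann/emotion-english-distilroberta-base',
):
    clf = pipeline('sentiment-analysis', model=name)  # should load from HF_HOME, not the network
    print(name, clf(SAMPLE)[0])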
Procfile ADDED
@@ -0,0 +1 @@
+ web: gunicorn app:app
app.py ADDED
@@ -0,0 +1,263 @@
+ from flask import Flask, request, render_template, make_response
+ from flask_sqlalchemy import SQLAlchemy
+ from sentiment_model import preprocess_text, analyze_sentiment, read_file
+ from wordcloud import WordCloud
+ import os
+ import nltk
+
+ # Ensure NLTK uses a writable directory inside the container
+ NLTK_DIR = os.environ.get('NLTK_DATA', os.path.join(os.getcwd(), 'nltk_data'))
+ os.makedirs(NLTK_DIR, exist_ok=True)
+ if NLTK_DIR not in nltk.data.path:
+     nltk.data.path.insert(0, NLTK_DIR)
+
+ # Download required NLTK resources to the writable dir (no-op if present)
+ for pkg in ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger']:
+     try:
+         nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
+     except Exception:
+         pass
+
+ app = Flask(__name__, static_folder='static')
+ app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///sentiment_data.db'
+ app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+ db = SQLAlchemy(app)
+
+ # Define SentimentRecord model
+ class SentimentRecord(db.Model):
+     id = db.Column(db.Integer, primary_key=True)
+     original_text = db.Column(db.Text, nullable=False)
+     cleaned_text = db.Column(db.Text, nullable=False)
+     removed_text = db.Column(db.Text, nullable=False)
+     normalized_text = db.Column(db.Text, nullable=False)
+     tokenized_text = db.Column(db.Text, nullable=False)
+     stemmed_text = db.Column(db.Text, nullable=False)
+     lemmatized_text = db.Column(db.Text, nullable=False)
+     sentiment = db.Column(db.String(20), nullable=False)
+     ner = db.Column(db.Text, nullable=False)
+     pos = db.Column(db.Text, nullable=False)
+
+ with app.app_context():
+     db.create_all()
+
+ # Global variables to store the analysis result
+ analysis_result = {}
+
+ @app.route('/')
+ def home():
+     return render_template('index.html',
+                            sentiment=None,
+                            text=None,
+                            file_uploaded=None,
+                            model_type='default')
+
+ @app.route('/analyze', methods=['POST'])
+ def analyze():
+     global analysis_result  # To store the results globally for the download
+     text = request.form.get('text', '').strip()
+     file = request.files.get('file')
+     model_type = request.form.get('model_type', 'default')
+
+     file_uploaded = False
+     if file and file.filename != '':
+         text = read_file(file)
+         file_uploaded = True
+
+     if not text or len(text.split()) < 4:
+         return render_template('index.html',
+                                error='Please provide at least 4 words for analysis.',
+                                text=text,
+                                model_type=model_type,
+                                file_uploaded=file_uploaded)
+
+     word_count = len(text.split())
+     if word_count > 300:
+         return render_template('index.html',
+                                error='Input text exceeds the 300-word limit.',
+                                text=text,
+                                model_type=model_type,
+                                file_uploaded=file_uploaded)
+
+     try:
+         # Step 1: Preprocess text (cleaning, normalization, etc.)
+         cleaned_text, removed_text, normalized_text, tokenized_text, stemmed_text, lemmatized_text, ner, pos = preprocess_text(text)
+
+         # Step 2: Use lemmatized text for sentiment analysis
+         lemmatized_text_joined = " ".join(lemmatized_text)
+         sentiment, probabilities = analyze_sentiment(lemmatized_text_joined, model_type=model_type)
+
+         # Word-level sentiment analysis
+         neutral_words, positive_words, negative_words = [], [], []
+
+         if model_type != 'emotion':
+             for word in lemmatized_text:
+                 word_sentiment, _ = analyze_sentiment(word, model_type=model_type)
+                 if word_sentiment == 'POSITIVE':
+                     positive_words.append(word)
+                 elif word_sentiment == 'NEGATIVE':
+                     negative_words.append(word)
+                 elif word_sentiment == 'NEUTRAL':
+                     neutral_words.append(word)
+
+             word_sentiment_distribution = {
+                 'positive': len(positive_words),
+                 'neutral': len(neutral_words),
+                 'negative': len(negative_words)
+             }
+         else:
+             # Emotion model word-level sentiment analysis
+             emotion_counters = {
+                 'ANGER': 0, 'DISGUST': 0, 'FEAR': 0, 'JOY': 0, 'NEUTRAL': 0, 'SADNESS': 0, 'SURPRISE': 0
+             }
+             emotion_words = {
+                 'ANGER': [], 'DISGUST': [], 'FEAR': [], 'JOY': [], 'NEUTRAL': [], 'SADNESS': [], 'SURPRISE': []
+             }
+             for word in lemmatized_text:
+                 word_sentiment, _ = analyze_sentiment(word, model_type=model_type)
+                 if word_sentiment in emotion_counters:
+                     emotion_counters[word_sentiment] += 1
+                     emotion_words[word_sentiment].append(word)
+
+             word_sentiment_distribution = {
+                 'anger': emotion_counters['ANGER'],
+                 'disgust': emotion_counters['DISGUST'],
+                 'fear': emotion_counters['FEAR'],
+                 'joy': emotion_counters['JOY'],
+                 'neutral': emotion_counters['NEUTRAL'],
+                 'sadness': emotion_counters['SADNESS'],
+                 'surprise': emotion_counters['SURPRISE']
+             }
+
+         # Store the analysis result in global variable for download
+         analysis_result = {
+             'sentiment': sentiment,
+             'model_type': model_type,
+             'cleaned_text': cleaned_text,
+             'removed_text': removed_text,
+             'normalized_text': normalized_text,
+             'tokenized_text': tokenized_text,
+             'stemmed_text': stemmed_text,
+             'lemmatized_text': lemmatized_text,
+             'ner': ner,
+             'pos': pos,
+             'original_text': text,
+             'word_sentiment_distribution': word_sentiment_distribution,
+             'positive_words': positive_words,
+             'negative_words': negative_words,
+             'neutral_words': neutral_words if model_type != 'emotion' else [],
+             'emotion_words': emotion_words if model_type == 'emotion' else None
+         }
+
+         # Generate Word Cloud
+         wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_joined)
+         wordcloud_path = os.path.join('static', 'wordcloud.png')
+         wordcloud.to_file(wordcloud_path)
+
+         return render_template('index.html',
+                                sentiment=sentiment,
+                                cleaned_text=cleaned_text,
+                                removed_text=removed_text,
+                                normalized_text=normalized_text,
+                                tokenized_text=tokenized_text,
+                                stemmed_text=" ".join(stemmed_text),
+                                lemmatized_text=" ".join(lemmatized_text),
+                                ner=ner,
+                                pos=pos,
+                                probabilities=probabilities,
+                                wordcloud_url=wordcloud_path,
+                                word_sentiment_distribution=word_sentiment_distribution,
+                                positive_words=positive_words,
+                                negative_words=negative_words,
+                                neutral_words=neutral_words if model_type != 'emotion' else [],
+                                emotion_words=emotion_words if model_type == 'emotion' else None,
+                                text=text,
+                                model_type=model_type,
+                                total_words=len(tokenized_text),
+                                file_uploaded=file_uploaded)
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return render_template('index.html',
+                                error='An error occurred during analysis.',
+                                text=text,
+                                model_type=model_type,
+                                file_uploaded=file_uploaded)
+
+ @app.route('/download')
+ def download_result():
+     global analysis_result
+     try:
+         if not analysis_result:
+             return "No analysis available for download", 400
+
+         # Build content for the TXT file
+         content = f"""
+ Sentiment
+ Overall Sentiment: {analysis_result['sentiment']}
+
+ Model Used
+ Selected Model: {analysis_result['model_type']}
+
+ Original Text:
+ {analysis_result['original_text']}
+
+ Text Preprocessing Results
+ Cleaned Text:
+ {analysis_result['cleaned_text']}
+
+ Removed Text:
+ {analysis_result['removed_text']}
+
+ Normalized Text:
+ {analysis_result['normalized_text']}
+
+ Tokenized Text:
+ {', '.join(analysis_result['tokenized_text'])}
+
+ Stemmed Text:
+ {" ".join(analysis_result['stemmed_text'])}
+
+ Lemmatized Text:
+ {" ".join(analysis_result['lemmatized_text'])}
+
+ Named Entities (NER):
+ {', '.join([f"{entity[0]} ({entity[1]})" for entity in analysis_result['ner']])}
+
+ POS Tags:
+ {', '.join([f"{word} ({tag})" for word, tag in analysis_result['pos']])}
+
+ Total Words: {len(analysis_result['tokenized_text'])}
+
+ """
+         # If the model is 'emotion', include emotion-based words
+         if analysis_result['model_type'] == 'emotion':
+             content += "\nEmotion-Specific Words:\n"
+             for emotion, words in analysis_result['emotion_words'].items():
+                 content += f"{emotion.capitalize()} Words: {len(words)}\n"
+                 content += f"{', '.join(words)}\n"
+
+         # Otherwise, include positive, neutral, and negative words for other models
+         else:
+             content += f"""
+ Positive Words: {len(analysis_result['positive_words'])}
+ {', '.join(analysis_result['positive_words'])}
+
+ Neutral Words: {len(analysis_result['neutral_words'])}
+ {', '.join(analysis_result['neutral_words'])}
+
+ Negative Words: {len(analysis_result['negative_words'])}
+ {', '.join(analysis_result['negative_words'])}
+ """
+
+         # Create a response object with the content
+         response = make_response(content)
+         response.headers["Content-Disposition"] = "attachment; filename=sentiment_analysis_result.txt"
+         response.headers["Content-Type"] = "text/plain"
+         return response
+     except Exception as e:
+         print(f"Error during file download: {e}")
+         return "Error in generating file", 500
+
+ if __name__ == '__main__':
+     port = int(os.environ.get('PORT', 7860))
+     app.run(host='0.0.0.0', port=port)
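app.py imports preprocess_text, analyze_sentiment, and read_file from a sentiment_model module that is not among the 50 files shown in this view. The sketch below is a hypothetical reconstruction of that interface, with signatures inferred only from how app.py calls it; the actual module in the commit may differ (for instance, it probably uses the spaCy en_core_web_md model installed in the Dockerfile for NER, which this sketch stubs out):

# sentiment_model.py -- hypothetical sketch; signatures inferred from app.py, not the real file.
import re
import string

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import pipeline

_MODELS = {
    'default': 'distilbert-base-uncased-finetuned-sst-2-english',
    'roberta': 'cardiffnlp/twitter-roberta-base-sentiment',
    'emotion': 'j-hartmann/emotion-english-distilroberta-base',
}
_LABELS = {'LABEL_0': 'NEGATIVE', 'LABEL_1': 'NEUTRAL', 'LABEL_2': 'POSITIVE'}  # RoBERTa label names
_pipelines = {}


def _get_pipeline(model_type):
    # Build each Hugging Face pipeline once and reuse it across requests.
    if model_type not in _pipelines:
        _pipelines[model_type] = pipeline('sentiment-analysis',
                                          model=_MODELS.get(model_type, _MODELS['default']))
    return _pipelines[model_type]


def read_file(file_storage):
    # app.py passes a Werkzeug FileStorage; treat the upload as UTF-8 text.
    return file_storage.read().decode('utf-8', errors='ignore')


def preprocess_text(text):
    # Returns the 8-tuple app.py unpacks: cleaned, removed, normalized,
    # tokens, stems, lemmas, named entities, POS tags.
    removed_text = ''.join(ch for ch in text if ch in string.punctuation)
    cleaned_text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    normalized_text = cleaned_text.lower()
    tokens = nltk.word_tokenize(normalized_text)        # needs the punkt data app.py downloads
    stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
    stems = [stemmer.stem(t) for t in tokens]
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]  # needs wordnet
    pos = nltk.pos_tag(tokens)                          # needs averaged_perceptron_tagger
    ner = []  # placeholder; the real module presumably runs spaCy NER here
    return cleaned_text, removed_text, normalized_text, tokens, stems, lemmas, ner, pos


def analyze_sentiment(text, model_type='default'):
    # Returns (label, {label: probability}) the way app.py unpacks it.
    result = _get_pipeline(model_type)(text[:512])[0]   # crude guard against over-long input
    label = _LABELS.get(result['label'], result['label'].upper())
    return label, {label: result['score']}

One note on the word-level loop in analyze(): it calls analyze_sentiment once per lemma, so a transformer forward pass runs for every word; caching per-word results or batching the lemmas through the pipeline in a single call would be the first optimisation to try.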
instance/sentiment_data.db ADDED
Binary file (8.19 kB).
nltk_data/corpora/wordnet.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59
+ size 10775600
nltk_data/taggers/averaged_perceptron_tagger.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1f13cf2532daadfd6f3bc481a49859f0b8ea6432ccdcd83e6a49a5f19008de9
+ size 2526731
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
+ size 6138625
nltk_data/tokenizers/punkt.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
+ size 13905355
nltk_data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB).
nltk_data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
+ =======================================================================================================================================================================
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
+ Literarni Noviny
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
+ (American)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
+ Text Bank (Suomen Kielen newspapers
+ Tekstipankki)
+ Finnish Center for IT Science
+ (CSC)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
+ (Switzerland) CD-ROM
+ (Uses "ss"
+ instead of "ß")
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
+ (Bokmål and Information Technologies,
+ Nynorsk) Bergen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
+ (http://www.nkjp.pl/)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
+ (Brazilian) (Linguateca)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
+ Slovene Academy for Arts
+ and Sciences
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
+ (and some other texts)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
+ (Türkçe Derlem Projesi)
+ University of Ankara
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
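The training snippet at the end of this README predates Python 3 (the "Ur" mode passed to codecs.open is no longer accepted by current interpreters). A rough modern equivalent on today's NLTK, assuming a plain-text corpus file such as the slovene.plain example used above:

# Sketch only: the same training recipe on current NLTK, reusing the README's example file name.
import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

with open("slovene.plain", encoding="iso-8859-2") as f:
    text = f.read()

trainer = PunktTrainer()
trainer.train(text, finalize=True)                  # learn abbreviations, collocations, sentence starters
tokenizer = PunktSentenceTokenizer(trainer.get_params())

with open("slovene.pickle", "wb") as out:           # same pickled artefact the README produces
    pickle.dump(tokenizer, out)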
nltk_data/tokenizers/punkt/PY3/czech.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
+ size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
+ size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
+ size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+ size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
+ size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
+ size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
+ size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
+ size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
+ size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
+ size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+ size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
+ size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
+ size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
+ size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
+ size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
+ size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
+ size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
+ size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
+ size 1017038
nltk_data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
+ =======================================================================================================================================================================
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
+ Literarni Noviny
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
+ (American)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
+ Text Bank (Suomen Kielen newspapers
+ Tekstipankki)
+ Finnish Center for IT Science
+ (CSC)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
+ (Switzerland) CD-ROM
+ (Uses "ss"
+ instead of "ß")
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
+ (Bokmål and Information Technologies,
+ Nynorsk) Bergen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
+ (http://www.nkjp.pl/)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
+ (Brazilian) (Linguateca)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
+ Slovene Academy for Arts
+ and Sciences
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
+ (and some other texts)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
+ (Türkçe Derlem Projesi)
+ University of Ankara
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
nltk_data/tokenizers/punkt/czech.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690
+ size 1265552
nltk_data/tokenizers/punkt/danish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20
+ size 1264725
nltk_data/tokenizers/punkt/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c
+ size 742624
nltk_data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
+ size 433305
nltk_data/tokenizers/punkt/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d
+ size 1596714
nltk_data/tokenizers/punkt/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835
+ size 1951656
nltk_data/tokenizers/punkt/french.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee
+ size 583482
nltk_data/tokenizers/punkt/german.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c
+ size 1526714
nltk_data/tokenizers/punkt/greek.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c
+ size 1953106
nltk_data/tokenizers/punkt/italian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805
+ size 658331
nltk_data/tokenizers/punkt/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+ size 221207
nltk_data/tokenizers/punkt/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf
+ size 1259779
nltk_data/tokenizers/punkt/polish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92
+ size 2042451
nltk_data/tokenizers/punkt/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0
+ size 649051
nltk_data/tokenizers/punkt/russian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
+ size 33027
nltk_data/tokenizers/punkt/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea
+ size 832867
nltk_data/tokenizers/punkt/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3
+ size 597831