aradhyapavan committed
Commit 7cb1242 · verified · 1 Parent(s): 0cca74f

Sentiment analysis using pretrained models

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .dockerignore +12 -0
  2. .gitattributes +2 -0
  3. .slugignore +8 -0
  4. Dockerfile +68 -0
  5. Procfile +1 -0
  6. app.py +263 -0
  7. instance/sentiment_data.db +0 -0
  8. nltk_data/corpora/wordnet.zip +3 -0
  9. nltk_data/taggers/averaged_perceptron_tagger.zip +3 -0
  10. nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle +3 -0
  11. nltk_data/tokenizers/punkt.zip +3 -0
  12. nltk_data/tokenizers/punkt/.DS_Store +0 -0
  13. nltk_data/tokenizers/punkt/PY3/README +98 -0
  14. nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
  15. nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
  16. nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
  17. nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
  18. nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
  19. nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
  20. nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
  21. nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
  22. nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
  23. nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
  24. nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
  25. nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
  26. nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
  27. nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
  28. nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
  29. nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
  30. nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
  31. nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
  32. nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
  33. nltk_data/tokenizers/punkt/README +98 -0
  34. nltk_data/tokenizers/punkt/czech.pickle +3 -0
  35. nltk_data/tokenizers/punkt/danish.pickle +3 -0
  36. nltk_data/tokenizers/punkt/dutch.pickle +3 -0
  37. nltk_data/tokenizers/punkt/english.pickle +3 -0
  38. nltk_data/tokenizers/punkt/estonian.pickle +3 -0
  39. nltk_data/tokenizers/punkt/finnish.pickle +3 -0
  40. nltk_data/tokenizers/punkt/french.pickle +3 -0
  41. nltk_data/tokenizers/punkt/german.pickle +3 -0
  42. nltk_data/tokenizers/punkt/greek.pickle +3 -0
  43. nltk_data/tokenizers/punkt/italian.pickle +3 -0
  44. nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
  45. nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
  46. nltk_data/tokenizers/punkt/polish.pickle +3 -0
  47. nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
  48. nltk_data/tokenizers/punkt/russian.pickle +3 -0
  49. nltk_data/tokenizers/punkt/slovene.pickle +3 -0
  50. nltk_data/tokenizers/punkt/spanish.pickle +3 -0
.dockerignore ADDED
@@ -0,0 +1,12 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.sqlite3
+ instance/
+ .git
+ .gitignore
+ .env
+ *.log
+ before.zip
+ wordcloud.png
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ static/nlpa.png filter=lfs diff=lfs merge=lfs -text
+ static/wordcloud.png filter=lfs diff=lfs merge=lfs -text
.slugignore ADDED
@@ -0,0 +1,8 @@
+ .git
+ __pycache__
+ *.pyc
+ node_modules/
+ tests/
+ *.log
+ nltk_data/
+
Dockerfile ADDED
@@ -0,0 +1,68 @@
+ # syntax=docker/dockerfile:1
+
+ FROM python:3.10-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PORT=7860 \
+     HF_HOME=/app/.cache/huggingface \
+     NLTK_DATA=/app/nltk_data \
+     MPLCONFIGDIR=/app/.config/matplotlib
+
+ # System deps (build tools and libs for pillow/wordcloud)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     gcc \
+     g++ \
+     libjpeg-dev \
+     zlib1g-dev \
+     libpng-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Install Python deps first (better layer caching)
+ COPY requirements.txt ./
+ RUN pip install --upgrade pip && \
+     pip install -r requirements.txt
+
+ # Prepare writable caches
+ RUN mkdir -p ${HF_HOME} ${NLTK_DATA} ${MPLCONFIGDIR}
+
+ # Copy application code
+ COPY . .
+
+ # Ensure writable permissions for runtime (Spaces/K8s non-root scenarios)
+ RUN chmod -R 777 /app
+
+ # Run postbuild (e.g., install spaCy model) if present
+ RUN if [ -f postbuild ]; then sh postbuild; else python -m spacy download en_core_web_md; fi
+
+ # Pre-download NLTK data to writable dir
+ RUN python - <<'PY'
+ import nltk, os
+ os.makedirs(os.environ.get('NLTK_DATA','/app/nltk_data'), exist_ok=True)
+ for pkg in ['punkt','punkt_tab','wordnet','averaged_perceptron_tagger']:
+     try:
+         nltk.download(pkg, download_dir=os.environ['NLTK_DATA'])
+     except Exception as e:
+         print('NLTK download failed for', pkg, e)
+ PY
+
+ # Preload HF transformer models to writable cache
+ RUN python - <<'PY'
+ from transformers import pipeline
+ # DistilBERT SST-2
+ pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
+ # RoBERTa Twitter
+ pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment')
+ # Emotion model
+ pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')
+ PY
+
+ # Expose default port (can be overridden by $PORT)
+ EXPOSE 7860
+
+ # Start the app using gunicorn (respects $PORT)
+ CMD ["sh", "-c", "gunicorn -b 0.0.0.0:${PORT:-7860} app:app"]
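The two heredoc RUN steps above warm the NLTK and Hugging Face caches at build time, so the container does not have to download anything when it starts. Below is a minimal smoke-test sketch, not part of this commit, that can be run inside the built image (it assumes HF_HOME and NLTK_DATA still point at the pre-populated directories) to confirm the baked-in assets resolve without new downloads:

# Hypothetical check script (illustrative only, not in the repo);
# run it inside the built image, e.g. docker run <image> python check_cache.py
import nltk
from transformers import pipeline

# Raises LookupError if the punkt tokenizer was not baked into NLTK_DATA.
nltk.data.find('tokenizers/punkt')

SAMPLE = "The build cache seems to be working nicely."
for name in (
    'distilbert-base-uncased-finetuned-sst-2-english',
    'cardiffnlp/twitter-roberta-base-sentiment',
    'j-hartmann/emotion-english-distilroberta-base',
):
    clf = pipeline('sentiment-analysis', model=name)  # should load from HF_HOME, not the network
    print(name, clf(SAMPLE)[0])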
Procfile ADDED
@@ -0,0 +1 @@
+ web: gunicorn app:app
app.py ADDED
@@ -0,0 +1,263 @@
+ from flask import Flask, request, render_template, make_response
+ from flask_sqlalchemy import SQLAlchemy
+ from sentiment_model import preprocess_text, analyze_sentiment, read_file
+ from wordcloud import WordCloud
+ import os
+ import nltk
+
+ # Ensure NLTK uses a writable directory inside the container
+ NLTK_DIR = os.environ.get('NLTK_DATA', os.path.join(os.getcwd(), 'nltk_data'))
+ os.makedirs(NLTK_DIR, exist_ok=True)
+ if NLTK_DIR not in nltk.data.path:
+     nltk.data.path.insert(0, NLTK_DIR)
+
+ # Download required NLTK resources to the writable dir (no-op if present)
+ for pkg in ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger']:
+     try:
+         nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)
+     except Exception:
+         pass
+
+ app = Flask(__name__, static_folder='static')
+ app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///sentiment_data.db'
+ app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+ db = SQLAlchemy(app)
+
+ # Define SentimentRecord model
+ class SentimentRecord(db.Model):
+     id = db.Column(db.Integer, primary_key=True)
+     original_text = db.Column(db.Text, nullable=False)
+     cleaned_text = db.Column(db.Text, nullable=False)
+     removed_text = db.Column(db.Text, nullable=False)
+     normalized_text = db.Column(db.Text, nullable=False)
+     tokenized_text = db.Column(db.Text, nullable=False)
+     stemmed_text = db.Column(db.Text, nullable=False)
+     lemmatized_text = db.Column(db.Text, nullable=False)
+     sentiment = db.Column(db.String(20), nullable=False)
+     ner = db.Column(db.Text, nullable=False)
+     pos = db.Column(db.Text, nullable=False)
+
+ with app.app_context():
+     db.create_all()
+
+ # Global variables to store the analysis result
+ analysis_result = {}
+
+ @app.route('/')
+ def home():
+     return render_template('index.html',
+                            sentiment=None,
+                            text=None,
+                            file_uploaded=None,
+                            model_type='default')
+
+ @app.route('/analyze', methods=['POST'])
+ def analyze():
+     global analysis_result  # To store the results globally for the download
+     text = request.form.get('text', '').strip()
+     file = request.files.get('file')
+     model_type = request.form.get('model_type', 'default')
+
+     file_uploaded = False
+     if file and file.filename != '':
+         text = read_file(file)
+         file_uploaded = True
+
+     if not text or len(text.split()) < 4:
+         return render_template('index.html',
+                                error='Please provide at least 4 words for analysis.',
+                                text=text,
+                                model_type=model_type,
+                                file_uploaded=file_uploaded)
+
+     word_count = len(text.split())
+     if word_count > 300:
+         return render_template('index.html',
+                                error='Input text exceeds the 300-word limit.',
+                                text=text,
+                                model_type=model_type,
+                                file_uploaded=file_uploaded)
+
+     try:
+         # Step 1: Preprocess text (cleaning, normalization, etc.)
+         cleaned_text, removed_text, normalized_text, tokenized_text, stemmed_text, lemmatized_text, ner, pos = preprocess_text(text)
+
+         # Step 2: Use lemmatized text for sentiment analysis
+         lemmatized_text_joined = " ".join(lemmatized_text)
+         sentiment, probabilities = analyze_sentiment(lemmatized_text_joined, model_type=model_type)
+
+         # Word-level sentiment analysis
+         neutral_words, positive_words, negative_words = [], [], []
+
+         if model_type != 'emotion':
+             for word in lemmatized_text:
+                 word_sentiment, _ = analyze_sentiment(word, model_type=model_type)
+                 if word_sentiment == 'POSITIVE':
+                     positive_words.append(word)
+                 elif word_sentiment == 'NEGATIVE':
+                     negative_words.append(word)
+                 elif word_sentiment == 'NEUTRAL':
+                     neutral_words.append(word)
+
+             word_sentiment_distribution = {
+                 'positive': len(positive_words),
+                 'neutral': len(neutral_words),
+                 'negative': len(negative_words)
+             }
+         else:
+             # Emotion model word-level sentiment analysis
+             emotion_counters = {
+                 'ANGER': 0, 'DISGUST': 0, 'FEAR': 0, 'JOY': 0, 'NEUTRAL': 0, 'SADNESS': 0, 'SURPRISE': 0
+             }
+             emotion_words = {
+                 'ANGER': [], 'DISGUST': [], 'FEAR': [], 'JOY': [], 'NEUTRAL': [], 'SADNESS': [], 'SURPRISE': []
+             }
+             for word in lemmatized_text:
+                 word_sentiment, _ = analyze_sentiment(word, model_type=model_type)
+                 if word_sentiment in emotion_counters:
+                     emotion_counters[word_sentiment] += 1
+                     emotion_words[word_sentiment].append(word)
+
+             word_sentiment_distribution = {
+                 'anger': emotion_counters['ANGER'],
+                 'disgust': emotion_counters['DISGUST'],
+                 'fear': emotion_counters['FEAR'],
+                 'joy': emotion_counters['JOY'],
+                 'neutral': emotion_counters['NEUTRAL'],
+                 'sadness': emotion_counters['SADNESS'],
+                 'surprise': emotion_counters['SURPRISE']
+             }
+
+         # Store the analysis result in global variable for download
+         analysis_result = {
+             'sentiment': sentiment,
+             'model_type': model_type,
+             'cleaned_text': cleaned_text,
+             'removed_text': removed_text,
+             'normalized_text': normalized_text,
+             'tokenized_text': tokenized_text,
+             'stemmed_text': stemmed_text,
+             'lemmatized_text': lemmatized_text,
+             'ner': ner,
+             'pos': pos,
+             'original_text': text,
+             'word_sentiment_distribution': word_sentiment_distribution,
+             'positive_words': positive_words,
+             'negative_words': negative_words,
+             'neutral_words': neutral_words if model_type != 'emotion' else [],
+             'emotion_words': emotion_words if model_type == 'emotion' else None
+         }
+
+         # Generate Word Cloud
+         wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text_joined)
+         wordcloud_path = os.path.join('static', 'wordcloud.png')
+         wordcloud.to_file(wordcloud_path)
+
+         return render_template('index.html',
+                                sentiment=sentiment,
+                                cleaned_text=cleaned_text,
+                                removed_text=removed_text,
+                                normalized_text=normalized_text,
+                                tokenized_text=tokenized_text,
+                                stemmed_text=" ".join(stemmed_text),
+                                lemmatized_text=" ".join(lemmatized_text),
+                                ner=ner,
+                                pos=pos,
+                                probabilities=probabilities,
+                                wordcloud_url=wordcloud_path,
+                                word_sentiment_distribution=word_sentiment_distribution,
+                                positive_words=positive_words,
+                                negative_words=negative_words,
+                                neutral_words=neutral_words if model_type != 'emotion' else [],
+                                emotion_words=emotion_words if model_type == 'emotion' else None,
+                                text=text,
+                                model_type=model_type,
+                                total_words=len(tokenized_text),
+                                file_uploaded=file_uploaded)
+
+     except Exception as e:
+         print(f"Error: {e}")
+         return render_template('index.html',
+                                error='An error occurred during analysis.',
+                                text=text,
+                                model_type=model_type,
+                                file_uploaded=file_uploaded)
+
+ @app.route('/download')
+ def download_result():
+     global analysis_result
+     try:
+         if not analysis_result:
+             return "No analysis available for download", 400
+
+         # Build content for the TXT file
+         content = f"""
+ Sentiment
+ Overall Sentiment: {analysis_result['sentiment']}
+
+ Model Used
+ Selected Model: {analysis_result['model_type']}
+
+ Original Text:
+ {analysis_result['original_text']}
+
+ Text Preprocessing Results
+ Cleaned Text:
+ {analysis_result['cleaned_text']}
+
+ Removed Text:
+ {analysis_result['removed_text']}
+
+ Normalized Text:
+ {analysis_result['normalized_text']}
+
+ Tokenized Text:
+ {', '.join(analysis_result['tokenized_text'])}
+
+ Stemmed Text:
+ {" ".join(analysis_result['stemmed_text'])}
+
+ Lemmatized Text:
+ {" ".join(analysis_result['lemmatized_text'])}
+
+ Named Entities (NER):
+ {', '.join([f"{entity[0]} ({entity[1]})" for entity in analysis_result['ner']])}
+
+ POS Tags:
+ {', '.join([f"{word} ({tag})" for word, tag in analysis_result['pos']])}
+
+ Total Words: {len(analysis_result['tokenized_text'])}
+
+ """
+         # If the model is 'emotion', include emotion-based words
+         if analysis_result['model_type'] == 'emotion':
+             content += "\nEmotion-Specific Words:\n"
+             for emotion, words in analysis_result['emotion_words'].items():
+                 content += f"{emotion.capitalize()} Words: {len(words)}\n"
+                 content += f"{', '.join(words)}\n"
+
+         # Otherwise, include positive, neutral, and negative words for other models
+         else:
+             content += f"""
+ Positive Words: {len(analysis_result['positive_words'])}
+ {', '.join(analysis_result['positive_words'])}
+
+ Neutral Words: {len(analysis_result['neutral_words'])}
+ {', '.join(analysis_result['neutral_words'])}
+
+ Negative Words: {len(analysis_result['negative_words'])}
+ {', '.join(analysis_result['negative_words'])}
+ """
+
+         # Create a response object with the content
+         response = make_response(content)
+         response.headers["Content-Disposition"] = "attachment; filename=sentiment_analysis_result.txt"
+         response.headers["Content-Type"] = "text/plain"
+         return response
+     except Exception as e:
+         print(f"Error during file download: {e}")
+         return "Error in generating file", 500
+
+ if __name__ == '__main__':
+     port = int(os.environ.get('PORT', 7860))
+     app.run(host='0.0.0.0', port=port)
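app.py imports preprocess_text, analyze_sentiment, and read_file from a sentiment_model module that is not among the 50 files shown in this view. The sketch below is a hypothetical reconstruction of that interface, with signatures inferred only from how app.py calls it; the actual module in the commit may differ (for instance, it probably uses the spaCy en_core_web_md model installed in the Dockerfile for NER, which this sketch stubs out):

# sentiment_model.py -- hypothetical sketch; signatures inferred from app.py, not the real file.
import re
import string

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import pipeline

_MODELS = {
    'default': 'distilbert-base-uncased-finetuned-sst-2-english',
    'roberta': 'cardiffnlp/twitter-roberta-base-sentiment',
    'emotion': 'j-hartmann/emotion-english-distilroberta-base',
}
_LABELS = {'LABEL_0': 'NEGATIVE', 'LABEL_1': 'NEUTRAL', 'LABEL_2': 'POSITIVE'}  # RoBERTa label names
_pipelines = {}


def _get_pipeline(model_type):
    # Build each Hugging Face pipeline once and reuse it across requests.
    if model_type not in _pipelines:
        _pipelines[model_type] = pipeline('sentiment-analysis',
                                          model=_MODELS.get(model_type, _MODELS['default']))
    return _pipelines[model_type]


def read_file(file_storage):
    # app.py passes a Werkzeug FileStorage; treat the upload as UTF-8 text.
    return file_storage.read().decode('utf-8', errors='ignore')


def preprocess_text(text):
    # Returns the 8-tuple app.py unpacks: cleaned, removed, normalized,
    # tokens, stems, lemmas, named entities, POS tags.
    removed_text = ''.join(ch for ch in text if ch in string.punctuation)
    cleaned_text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    normalized_text = cleaned_text.lower()
    tokens = nltk.word_tokenize(normalized_text)        # needs the punkt data app.py downloads
    stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
    stems = [stemmer.stem(t) for t in tokens]
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]  # needs wordnet
    pos = nltk.pos_tag(tokens)                          # needs averaged_perceptron_tagger
    ner = []  # placeholder; the real module presumably runs spaCy NER here
    return cleaned_text, removed_text, normalized_text, tokens, stems, lemmas, ner, pos


def analyze_sentiment(text, model_type='default'):
    # Returns (label, {label: probability}) the way app.py unpacks it.
    result = _get_pipeline(model_type)(text[:512])[0]   # crude guard against over-long input
    label = _LABELS.get(result['label'], result['label'].upper())
    return label, {label: result['score']}

One note on the word-level loop in analyze(): it calls analyze_sentiment once per lemma, so a transformer forward pass runs for every word; caching per-word results or batching the lemmas through the pipeline in a single call would be the first optimisation to try.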
instance/sentiment_data.db ADDED
Binary file (8.19 kB).
nltk_data/corpora/wordnet.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59
+ size 10775600
nltk_data/taggers/averaged_perceptron_tagger.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1f13cf2532daadfd6f3bc481a49859f0b8ea6432ccdcd83e6a49a5f19008de9
+ size 2526731
nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25a5a19c7ced7b2bac3831da5bc0afcc2c34e5dd01cd4f361bb799949a696238
+ size 6138625
nltk_data/tokenizers/punkt.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
+ size 13905355
nltk_data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB).
nltk_data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
+ =======================================================================================================================================================================
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
+ Literarni Noviny
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
+ (American)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
+ Text Bank (Suomen Kielen newspapers
+ Tekstipankki)
+ Finnish Center for IT Science
+ (CSC)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
+ (Switzerland) CD-ROM
+ (Uses "ss"
+ instead of "ß")
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
+ (Bokmål and Information Technologies,
+ Nynorsk) Bergen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
+ (http://www.nkjp.pl/)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
+ (Brazilian) (Linguateca)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
+ Slovene Academy for Arts
+ and Sciences
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
+ (and some other texts)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
+ (Türkçe Derlem Projesi)
+ University of Ankara
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
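The training snippet at the end of this README predates Python 3 (the "Ur" mode passed to codecs.open is no longer accepted by current interpreters). A rough modern equivalent on today's NLTK, assuming a plain-text corpus file such as the slovene.plain example used above:

# Sketch only: the same training recipe on current NLTK, reusing the README's example file name.
import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

with open("slovene.plain", encoding="iso-8859-2") as f:
    text = f.read()

trainer = PunktTrainer()
trainer.train(text, finalize=True)                  # learn abbreviations, collocations, sentence starters
tokenizer = PunktSentenceTokenizer(trainer.get_params())

with open("slovene.pickle", "wb") as out:           # same pickled artefact the README produces
    pickle.dump(tokenizer, out)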
nltk_data/tokenizers/punkt/PY3/czech.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
+ size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
+ size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
+ size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+ size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
+ size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
+ size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
+ size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
+ size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
+ size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
+ size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+ size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
+ size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
+ size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
+ size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
+ size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
+ size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
+ size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
+ size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
+ size 1017038
nltk_data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
+ =======================================================================================================================================================================
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
+ Literarni Noviny
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
+ (American)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
+ Text Bank (Suomen Kielen newspapers
+ Tekstipankki)
+ Finnish Center for IT Science
+ (CSC)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
+ (Switzerland) CD-ROM
+ (Uses "ss"
+ instead of "ß")
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
+ (Bokmål and Information Technologies,
+ Nynorsk) Bergen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
+ (http://www.nkjp.pl/)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
+ (Brazilian) (Linguateca)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
+ Slovene Academy for Arts
+ and Sciences
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
+ (and some other texts)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
+ (Türkçe Derlem Projesi)
+ University of Ankara
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
nltk_data/tokenizers/punkt/czech.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690
+ size 1265552
nltk_data/tokenizers/punkt/danish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20
+ size 1264725
nltk_data/tokenizers/punkt/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c
+ size 742624
nltk_data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
+ size 433305
nltk_data/tokenizers/punkt/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d
+ size 1596714
nltk_data/tokenizers/punkt/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835
+ size 1951656
nltk_data/tokenizers/punkt/french.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee
+ size 583482
nltk_data/tokenizers/punkt/german.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c
+ size 1526714
nltk_data/tokenizers/punkt/greek.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c
+ size 1953106
nltk_data/tokenizers/punkt/italian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805
+ size 658331
nltk_data/tokenizers/punkt/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+ size 221207
nltk_data/tokenizers/punkt/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf
+ size 1259779
nltk_data/tokenizers/punkt/polish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92
+ size 2042451
nltk_data/tokenizers/punkt/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0
+ size 649051
nltk_data/tokenizers/punkt/russian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
+ size 33027
nltk_data/tokenizers/punkt/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea
+ size 832867
nltk_data/tokenizers/punkt/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3
+ size 597831