# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~
This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html
"""
import nltk
import yaml

from App.bin.constants import ASSETS


class Splitter(object):
    def __init__(self):
        # the punkt sentence tokenizer and the Treebank word tokenizer from NLTK
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):
    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """
        input format: list of lists of words
        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token has a
        form, a lemma, and a list of tags
        e.g.: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
               [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt format: no lemmatisation is performed, so the word itself stands in for the lemma
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos


class DictionaryTagger(object):
    def __init__(self, dictionary_paths):
        dictionaries = []
        for path in dictionary_paths:
            # 'with' guarantees each file is closed after loading
            with open(path, 'r') as dict_file:
                dictionaries.append(yaml.safe_load(dict_file))
        self.dictionary = {}
        self.max_key_size = 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                # len(key) counts characters, not tokens, so the search window
                # used in tag_sentence is generous but still safe
                self.max_key_size = max(self.max_key_size, len(key))
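
    # The dictionary files themselves are not part of this listing. Based on the
    # tutorial linked above, each YAML file is assumed to be a flat mapping from
    # an expression (possibly multi-word) to a list of tags, e.g.:
    #
    #   positive.yml:   good: [positive]
    #                   very good: [positive]
    #   inc.yml:        very: [inc]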

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        The result is a single tagging out of all the possible ones.
        The resulting tagging is determined by two priority rules:
        - longest matches have higher priority
        - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # avoid overflow
            tagged = False
            while j > i:
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = list(self.dictionary[literal])  # copy, so extending below does not mutate the dictionary
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:  # if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence
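
    # Illustration of the priority rules above (with hypothetical dictionary
    # entries 'good' and 'not good'): for the tagged sentence
    # [('not', 'not', ['RB']), ('good', 'good', ['JJ'])], the scan at i=0 tries
    # the widest window first, matches the two-token 'not good', and consumes
    # both tokens, so the longer expression wins over the single-token 'good';
    # only when no window starting at i matches does the original token pass
    # through untagged.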


class ClassifyWithIncr_it(object):
    def __init__(self):
        pass

    def value_of(self, sentiment):
        if sentiment == 'positive': return 1
        if sentiment == 'negative': return -1
        return 0

    def sentence_score(self, sentence_tokens, previous_token, acum_score):
        if not sentence_tokens:
            return acum_score
        else:
            current_token = sentence_tokens[0]
            tags = current_token[2]
            token_score = sum([self.value_of(tag) for tag in tags])
            if previous_token is not None:
                # a preceding 'inc' tag doubles, 'dec' halves, and 'inv' inverts the token's score
                previous_tags = previous_token[2]
                if 'inc' in previous_tags:
                    token_score *= 2.0
                elif 'dec' in previous_tags:
                    token_score /= 2.0
                elif 'inv' in previous_tags:
                    token_score *= -1.0
            return self.sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)

    def sentiment_score(self, review):
        return sum([self.sentence_score(sentence, None, 0.0) for sentence in review])
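
    # Worked example (with hypothetical dictionary tags): for the tagged sentence
    # [('very', 'very', ['inc']), ('good', 'good', ['positive', 'JJ'])],
    # 'very' contributes 0, and 'good' contributes +1, doubled to +2.0 because
    # the previous token carries 'inc', so the sentence scores 2.0.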

    def main(self, sentence):
        splitter = Splitter()
        postagger = POSTagger()
        pos = ASSETS + "dicts/positive.yml"
        neg = ASSETS + "dicts/negative.yml"
        inc = ASSETS + "dicts/inc.yml"
        dec = ASSETS + "dicts/dec.yml"
        inv = ASSETS + "dicts/inv.yml"
        dicttagger = DictionaryTagger([pos, neg, inc, dec, inv])

        split_sentences = splitter.split(sentence)
        pos_tagged_sentences = postagger.pos_tag(split_sentences)
        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

        print("Classification...")
        result = self.sentiment_score(dict_tagged_sentences)
        print(result)
        if result < 0:
            polarity = "problem"
        elif result > 0:
            polarity = "partialSolution"
        else:
            polarity = "neutre"
        return polarity


if __name__ == '__main__':
    text = """this/these can be annoying"""
    test = ClassifyWithIncr_it()
    print(test.main(text))
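
# Note: nltk.pos_tag and the punkt sentence splitter rely on NLTK models that
# must be downloaded once, e.g. (model names vary with the NLTK version):
#
#   import nltk
#   nltk.download('punkt')
#   nltk.download('averaged_perceptron_tagger')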