# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~

This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html

"""
from pprint import pprint
import os
import re
import sys

import nltk
import yaml

from App.bin.constants import ASSETS


class Splitter(object):
    """Split raw text into sentences, and each sentence into word tokens.

    Sentence splitting uses the NLTK punkt English model; word tokenizing
    uses the NLTK Treebank word tokenizer.
    """

    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Tokenize a paragraph of text.

        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'],
                   ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        return [self.nltk_tokenizer.tokenize(sent) for sent in sentences]


class POSTagger(object):
    """Attach part-of-speech tags to already tokenized sentences."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """Tag every word of every sentence with ``nltk.pos_tag``.

        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'],
                   ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is
        a ``(form, lemma, tags)`` tuple; no lemmatization is performed here,
        so the lemma slot simply repeats the word form.
            e.g.: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ'])], ...]
        """
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt format: (word, postag) -> (form, lemma, [postag])
        return [[(word, word, [postag]) for (word, postag) in sentence]
                for sentence in pos]


class DictionaryTagger(object):
    """Tag tokens (or multi-token expressions) found in YAML dictionaries.

    Each YAML file is expected to map an expression (a string) to a list of
    tags. Entries for the same expression coming from several files are
    merged into a single tag list.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the dictionaries found at ``dictionary_paths``.

        :param dictionary_paths: iterable of paths to YAML files.
        """
        self.dictionary = {}
        self.max_key_size = 0
        for path in dictionary_paths:
            # ``with`` guarantees the file is closed; the previous
            # ``map(close, files)`` was never evaluated on Python 3.
            with open(path, 'r') as dict_file:
                # An empty YAML document loads as None.
                curr_dict = yaml.safe_load(dict_file) or {}
            for key, tags in curr_dict.items():
                if key in self.dictionary:
                    self.dictionary[key].extend(tags)
                else:
                    # Copy so that later extends do not mutate loaded data.
                    self.dictionary[key] = list(tags)
                # Character length is an upper bound on the number of
                # tokens a key can span; it caps the search window below.
                self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Apply :meth:`tag_sentence` to every sentence of the input list."""
        return [self.tag_sentence(sentence)
                for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """Return ``sentence`` with dictionary tags added to matching spans.

        The result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        :param sentence: list of ``(form, lemma, tags)`` tokens.
        :param tag_with_lemmas: match on lemmas instead of word forms.
        :return: list of ``(form, lemma, tags)`` tuples; a matched
            multi-token span is collapsed into a single tuple, unmatched
            tokens are passed through unchanged.
        """
        tag_sentence = []
        N = len(sentence)
        # With no dictionary keys loaded, fall back to the sentence length.
        # Kept local so that instance state is not altered by tagging.
        max_key_size = self.max_key_size or N
        i = 0
        while i < N:
            j = min(i + max_key_size, N)  # avoid overflow
            tagged = False
            # Shrink the candidate span [i:j] from the right until it
            # matches a dictionary key or becomes empty.
            while j > i:
                span = sentence[i:j]
                expression_form = ' '.join([word[0] for word in span]).lower()
                expression_lemma = ' '.join([word[1] for word in span]).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    # Resume scanning after the matched span; this also
                    # ends the inner loop because j == i afterwards.
                    i = j
                    taggings = list(self.dictionary[literal])
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:
                        # if the tagged literal is a single token,
                        # conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence


class ClassifyWithIncr_it(object):
    """Score dictionary-tagged text and map the score to a polarity label."""

    def __init__(self):
        print("printing")

    def value_of(self, sentiment):
        """Return 1 for 'positive', -1 for 'negative' and 0 for any other tag."""
        if sentiment == 'positive':
            return 1
        if sentiment == 'negative':
            return -1
        return 0

    def sentence_score(self, sentence_tokens, previous_token, acum_score):
        """Add the score of ``sentence_tokens`` to ``acum_score``.

        Each token contributes the sum of :meth:`value_of` over its tags.
        That contribution is modified by the tags of the token right before
        it: 'inc' doubles it, 'dec' halves it, 'inv' negates it (only the
        first of these that is present applies).

        Implemented as a loop rather than recursion so that long sentences
        cannot exceed the interpreter recursion limit.

        :param sentence_tokens: list of ``(form, lemma, tags)`` tokens.
        :param previous_token: token preceding the first one, or None.
        :param acum_score: score accumulated so far.
        :return: the updated accumulated score.
        """
        if not sentence_tokens:
            return acum_score
        for current_token in sentence_tokens:
            tags = current_token[2]
            token_score = sum([self.value_of(tag) for tag in tags])
            if previous_token is not None:
                previous_tags = previous_token[2]
                if 'inc' in previous_tags:
                    token_score *= 2.0
                elif 'dec' in previous_tags:
                    token_score /= 2.0
                elif 'inv' in previous_tags:
                    token_score *= -1.0
            acum_score = acum_score + token_score
            previous_token = current_token
        return acum_score

    def sentiment_score(self, review):
        """Return the sum of the sentence scores of ``review``.

        :param review: list of sentences, each a list of tagged tokens.
        """
        return sum([self.sentence_score(sentence, None, 0.0)
                    for sentence in review])

    def main(self, sentence):
        """Classify ``sentence`` (raw text) and return its polarity label.

        The text is split, POS-tagged, tagged with the five YAML
        dictionaries located under ``ASSETS`` and scored.

        :return: "problem" for a negative score, "partialSolution" for a
            positive score, "neutre" for a score of zero.
        """
        splitter = Splitter()
        postagger = POSTagger()

        pos = ASSETS + "dicts/positive.yml"
        neg = ASSETS + "dicts/negative.yml"
        inc = ASSETS + "dicts/inc.yml"
        dec = ASSETS + "dicts/dec.yml"
        inv = ASSETS + "dicts/inv.yml"
        dicttagger = DictionaryTagger([pos, neg, inc, dec, inv])

        splitted_sentences = splitter.split(sentence)
        pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

        print("Classification...")
        result = self.sentiment_score(dict_tagged_sentences)
        print(result)

        if result < 0:
            polarity = "problem"
        elif result > 0:
            polarity = "partialSolution"
        else:
            polarity = "neutre"
        return polarity


if __name__ == '__main__':
    text = """this/these can be annoying"""
    test = ClassifyWithIncr_it()
    print(test.main(text))