WordNet lemmatization and POS tagging in Python


First of all, you can use nltk.pos_tag() directly without training it. The function will load a pretrained tagger from a file. You can see the file name with nltk.tag._POS_TAGGER:

>>> nltk.tag._POS_TAGGER
'taggers/maxent_treebank_pos_tagger/english.pickle'

Because it was trained on the Treebank corpus, it also uses the Treebank tag set.
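
For example, tagging a tokenized sentence returns Treebank tags such as DT, NN, and VBZ. A minimal sketch (note that in newer NLTK versions nltk.tag._POS_TAGGER no longer exists and nltk.pos_tag() uses the averaged perceptron tagger instead, so the required data may need to be downloaded first):

import nltk

# assumes the 'punkt' tokenizer and tagger data are available,
# e.g. via nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')
tokens = nltk.word_tokenize('The staff is nice and the eggplant is not bad')
print(nltk.pos_tag(tokens))
# [('The', 'DT'), ('staff', 'NN'), ('is', 'VBZ'), ('nice', 'JJ'), ...]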

The following function maps the Treebank tags to WordNet part-of-speech names:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

You can then use the return value with the lemmatizer:

>>> from nltk.stem.wordnet import WordNetLemmatizer
>>> lemmatizer = WordNetLemmatizer()
>>> lemmatizer.lemmatize('going', wordnet.VERB)
'go'

Check the return value before passing it to the lemmatizer, because an empty string raises a KeyError.
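
Putting the pieces together, a minimal sketch that reuses the get_wordnet_pos function and the lemmatizer defined above, and guards against the empty-string case:

import nltk

word, tag = nltk.pos_tag(['going'])[0]  # typically ('going', 'VBG')
wordnet_pos = get_wordnet_pos(tag)
if wordnet_pos:
    print(lemmatizer.lemmatize(word, wordnet_pos))  # 'go'
else:
    # no WordNet equivalent for this tag: fall back to the default (noun)
    print(lemmatizer.lemmatize(word))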


From the source code of nltk.corpus.reader.wordnet (http://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html):

#{ Part-of-speech constants
ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
#}
POS_LIST = [NOUN, VERB, ADJ, ADV]
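
These constants are just the single-letter strings that the lemmatizer expects, which you can verify interactively:

>>> from nltk.corpus import wordnet
>>> wordnet.ADJ, wordnet.ADJ_SAT, wordnet.ADV, wordnet.NOUN, wordnet.VERB
('a', 's', 'r', 'n', 'v')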


Steps to convert: Document -> Sentences -> Tokens -> POS -> Lemmas

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# example text
text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'

class Splitter(object):
    """
    Split the document into sentences and tokenize each sentence.
    """
    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        out: [['What', 'can', 'I', 'say', 'about', 'this', 'place', '.'], ...]
        """
        # split the document into single sentences
        sentences = self.splitter.tokenize(text)
        # tokenize each sentence
        tokens = [self.tokenizer.tokenize(sent) for sent in sentences]
        return tokens

class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass

    def get_wordnet_pos(self, treebank_tag):
        """
        Map a Treebank tag to a WordNet POS tag (a, n, r, v) for lemmatization.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # the default POS in lemmatization is noun
            return wordnet.NOUN

    def pos_tag(self, tokens):
        # find the POS tag for each token: [('What', 'WP'), ('can', 'MD'), ('I', 'PRP'), ...]
        pos_tokens = [nltk.pos_tag(token) for token in tokens]

        # lemmatize using the POS tag; convert into triples of
        # (original word, lemmatized word, [POS tag]), e.g.
        # [('What', 'What', ['WP']), ('can', 'can', ['MD']), ...]
        pos_tokens = [[(word, lemmatizer.lemmatize(word, self.get_wordnet_pos(pos_tag)), [pos_tag])
                       for (word, pos_tag) in pos] for pos in pos_tokens]
        return pos_tokens

lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

# step 1: split the document into sentences, then tokenize
tokens = splitter.split(text)

# step 2: lemmatize using the POS tagger
lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)
print(lemma_pos_token)
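
Each sentence in the result is a list of (original word, lemma, [Treebank tag]) triples; for example, the second sentence should yield entries like ('is', 'be', ['VBZ']) and ('restaurants', 'restaurant', ['NNS']), since VBZ maps to wordnet.VERB and NNS to wordnet.NOUN.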