Counting n-gram frequency in Python with NLTK
NLTK comes with its own bigrams generator, as well as a convenient FreqDist() function.
# Tokenize a text file, build its bigrams, and print each bigram's frequency.
import nltk

# Use a context manager so the file handle is closed even on error
# (the original left the file open).
with open('a_text_file') as f:
    raw = f.read()

tokens = nltk.word_tokenize(raw)

# Create your bigrams
bgs = nltk.bigrams(tokens)

# Compute frequency distribution for all the bigrams in the text.
# Python 3: print is a function (the original used the Python 2 statement form).
fdist = nltk.FreqDist(bgs)
for k, v in fdist.items():
    print(k, v)
Once you have access to the bigrams and their frequency distribution, you can filter according to your needs.
Hope that helps.
from nltk import FreqDist
from nltk.util import ngrams


def compute_freq(path='corpus.txt'):
    """Compute the bigram frequency distribution of a whitespace-tokenized corpus.

    Args:
        path: Text file to read, one sentence per line (defaults to the
            original hard-coded 'corpus.txt', so existing callers are unaffected).

    Returns:
        nltk.FreqDist mapping each bigram tuple to its count. (The original
        built the distribution but discarded it by returning nothing.)
    """
    bigram_fdist = FreqDist()
    # Context manager fixes the original's leaked file handle; the unused
    # trigram FreqDist from the original has been dropped.
    with open(path, 'r') as textfile:
        for line in textfile:
            # Skip blank lines (a bare newline has length 1).
            if len(line) > 1:
                tokens = line.strip().split(' ')
                bigram_fdist.update(ngrams(tokens, 2))
    return bigram_fdist


compute_freq()