"""Tally part-of-speech tags over the WSJ portion of a Penn Treebank checkout.

Walks every section directory under ``$PENN_TREEBANK_DIR/combined/wsj``,
loads each ``.mrg`` file with ``MRG_Document`` and counts the ``pos``
attribute of every entry in ``doc.allWords``.  The name of each section
directory is printed as it is processed; afterwards one ``<tag> <count>``
line is printed per tag, most frequent first.

Raises ``KeyError`` if the ``PENN_TREEBANK_DIR`` environment variable is
not set.
"""
from mrg_utils import *  # presumably supplies MRG_Document -- confirm in mrg_utils
import os

# Resolve the corpus location once instead of on every loop iteration.
wsj_root = os.path.join(os.environ['PENN_TREEBANK_DIR'], 'combined', 'wsj')

# Maps each POS tag to the number of words carrying it.
posDict = {}

for section in os.listdir(wsj_root):
    # Only entries without a '.' in their name are treated as section
    # directories; anything else in the folder is skipped.
    if '.' in section:
        continue
    # Progress output.  Single-argument print(...) emits the same text on
    # Python 2 and Python 3.
    print(section)
    section_path = os.path.join(wsj_root, section)
    for filename in os.listdir(section_path):
        # Only files whose last dot-separated component is 'mrg' are parsed.
        if filename.split('.')[-1] != 'mrg':
            continue
        doc = MRG_Document(os.path.join(section_path, filename))
        for word in doc.allWords:
            posDict[word.pos] = posDict.get(word.pos, 0) + 1

# Sort the (tag, count) pairs directly.  Inverting the mapping into
# {count: tag} would silently drop every tag that shares its count with
# another tag, because the later one overwrites the earlier one.
# The sort is stable, so tags with equal counts keep dictionary order.
for tag, count in sorted(posDict.items(), key=lambda item: item[1],
                         reverse=True):
    print('%s %s' % (tag, count))