I am trying to extract phrases from my corpus. For this I have defined two rules: one is a noun followed by one or more nouns, and the other is an adjective followed by a noun. If the same phrase is extracted by both rules, the program should ignore the second one. The problem I am facing is that phrases are extracted from the first rule only, and the second rule is not being applied. Below is the code:
# Chunk grammar handed to nltk.RegexpParser. Both rules produce NP chunks and
# are applied in the order written, so tokens already chunked by rule 1 are
# not chunked again by rule 2.
#   rule 1: a noun followed by one or more nouns
#   rule 2: an adjective followed by one or more nouns (``+`` rather than
#           ``*`` so that a lone adjective is not reported as a phrase)
# NOTE(review): rule 2 can only fire if the corpus really tags adjectives as
# ADJ (Penn-Treebank-style corpora use JJ) -- confirm against the tagged files.
PATTERN = r"""
      NP: {<NN><NN>+}
      {<ADJ><NN>+}
       """
    # Presumably the minimum frequency a phrase must reach to be kept; it is
    # not referenced in the code shown here -- verify against its users.
    MIN_FREQ = 1
    # Presumably the minimum C-value a phrase must reach to be kept; it is
    # not referenced in the code shown here -- verify against its users.
    MIN_CVAL = -13 # lowest cval -13
    def __init__(self):
        """Open the tagged corpus and clear the cached statistics.

        The corpus is read from ``../multiwords/test`` (resolved against the
        current working directory) and every file in it is included.
        """
        self.corpus = nltk.corpus.reader.TaggedCorpusReader(
            os.path.abspath('../multiwords/test'), '.*'
        )
        # Both caches start empty.
        self.word_count_by_document = self.phrase_frequencies = None
def calculate_phrase_frequencies(self):
    """
    Chunk every tagged sentence with PATTERN and count the NP phrases.

    Each NP chunk is reduced to its words (POS tags dropped) joined by
    single spaces. The number of occurrences of every such phrase is
    stored in ``self.phrase_frequencies`` as a ``defaultdict(int)``.
    Nothing is returned.
    """
    chunk_freq_dict = defaultdict(int)
    chunker = nltk.RegexpParser(self.PATTERN)
    for sent in self.corpus.tagged_sents():
        # Tokens without a tag cannot be matched by the tag patterns.
        tagged = [(word, tag) for word, tag in sent if tag is not None]
        tree = chunker.parse(tagged)
        # Read the chunk contents from the tree itself instead of slicing
        # its printed form: that avoids the Python-2-only Tree.__unicode__
        # and any dependence on how the tree happens to be pretty-printed.
        for chk in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            just_phrase = ' '.join(word for word, _tag in chk.leaves())
            chunk_freq_dict[just_phrase] += 1
    self.phrase_frequencies = chunk_freq_dict