Commit 0f584cf8 authored by Administrator

[FEAT] Adding stems for each ngram during extraction (in Caches).

parent fcf87c16
......@@ -61,6 +61,14 @@ except Exception as error:
typeDoc = NodeType(name='Document')
typeDoc.save()
# Make sure a NodeType named 'Stem' is available as `typeStem`: fetch it, or
# build and save a new one when the lookup raises.
try:
typeStem = NodeType.objects.get(name='Stem')
except Exception as error:
# The failed lookup is reported on stdout before the row is created.
print(error)
typeStem = NodeType(name='Stem')
typeStem.save()
try:
typeDoc = NodeType.objects.get(name='WhiteList')
except Exception as error:
......@@ -132,4 +140,10 @@ except:
project = Node(name='Bees project', type=typeProject, user=me)
project.save()
# Make sure the 'Stem' node exists, fetching it or creating it for user `me`.
# The lookup filters on the type as well as the name because
# NgramsCache.__missing__ fetches this node with
# `Node.objects.get(name='Stem', type=type_stem)`; matching on the name alone
# could be satisfied by a same-named node of another type, in which case the
# Stem-typed node would never be created.
try:
    stem = Node.objects.get(name='Stem', type=typeStem)
except Exception:
    # `except Exception` rather than a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    stem = Node(name='Stem', type=typeStem, user=me)
    stem.save()
......@@ -3,6 +3,13 @@ from parsing.NgramsExtractors import *
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()
def stem_all(ngram):
    """Return `ngram` with every space-separated token replaced by its stem.

    The string is split on single spaces, each piece is passed through the
    module-level stemmer `st`, and the results are re-joined with single
    spaces.
    """
    tokens = ngram.split(" ")
    stemmed_tokens = [st.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)
class NgramsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
......@@ -16,13 +23,31 @@ class NgramsCache(defaultdict):
def __missing__(self, terms):
    """Return the Ngram for `terms`, fetching or creating it on a cache miss.

    Steps performed:

    1. `terms` is stripped and lower-cased.
    2. The matching Ngram for `self.language` is fetched; if the lookup
       raises, the error is printed and a new Ngram is saved instead.
    3. The stemmed form of the terms (via `stem_all`) is fetched or created
       as an Ngram with the same language and `n`.
    4. A NodeNgramNgram row linking stem -> ngram under the 'Stem' node is
       created (score 1) unless one can already be fetched.

    The Ngram is stored in the cache under the *normalised* terms, not under
    the key that was originally requested.

    Any exception raised while fetching the 'Stem' NodeType or the 'Stem'
    Node propagates to the caller; those lookups have no fallback.
    """
    terms = terms.strip().lower()

    # Fetch the ngram itself, or create it when the lookup fails.
    try:
        ngram = node.models.Ngram.objects.get(terms=terms, language=self.language)
    except Exception as error:
        print(error)
        ngram = node.models.Ngram(terms=terms, n=len(terms.split()), language=self.language)
        ngram.save()

    # Fetch or create the Ngram holding the stemmed form of the terms.
    stem_terms = stem_all(ngram.terms)
    try:
        stem = node.models.Ngram.objects.get(terms=stem_terms, language=ngram.language, n=ngram.n)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed; this also matches
        # the handler used for the ngram lookup above.
        stem = node.models.Ngram(terms=stem_terms, language=ngram.language, n=ngram.n)
        stem.save()

    # The stem -> ngram link hangs off the node named 'Stem' of type 'Stem'.
    type_stem = node.models.NodeType.objects.get(name='Stem')
    node_stem = node.models.Node.objects.get(name='Stem', type=type_stem)
    try:
        node.models.NodeNgramNgram.objects.get(node=node_stem, ngramx=stem, ngramy=ngram)
    except Exception:
        node.models.NodeNgramNgram(node=node_stem, ngramx=stem, ngramy=ngram, score=1).save()

    self[terms] = ngram
    return self[terms]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment