Commit 0f584cf8 authored by Administrator

[FEAT] Add a stem for each ngram during extraction (in Caches).

parent fcf87c16
...@@ -61,6 +61,14 @@ except Exception as error: ...@@ -61,6 +61,14 @@ except Exception as error:
typeDoc = NodeType(name='Document') typeDoc = NodeType(name='Document')
typeDoc.save() typeDoc.save()
# Make sure the 'Stem' node type exists, creating it the first time through.
try:
    typeStem = NodeType.objects.get(name='Stem')
except NodeType.DoesNotExist as error:
    # Only a missing row triggers creation; any other failure (database
    # unreachable, several matching rows) propagates instead of silently
    # inserting another 'Stem' type.
    print(error)
    typeStem = NodeType(name='Stem')
    typeStem.save()
try: try:
typeDoc = NodeType.objects.get(name='WhiteList') typeDoc = NodeType.objects.get(name='WhiteList')
except Exception as error: except Exception as error:
...@@ -132,4 +140,10 @@ except: ...@@ -132,4 +140,10 @@ except:
project = Node(name='Bees project', type=typeProject, user=me) project = Node(name='Bees project', type=typeProject, user=me)
project.save() project.save()
# Make sure the node named 'Stem' exists; it is looked up by name only and,
# when absent, created with the 'Stem' type and owned by `me`.
try:
    stem = Node.objects.get(name='Stem')
except Node.DoesNotExist:
    # A bare `except:` here would also swallow KeyboardInterrupt and
    # MultipleObjectsReturned and insert a duplicate node.
    stem = Node(name='Stem', type=typeStem, user=me)
    stem.save()
...@@ -3,6 +3,13 @@ from parsing.NgramsExtractors import * ...@@ -3,6 +3,13 @@ from parsing.NgramsExtractors import *
from collections import defaultdict from collections import defaultdict
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()
def stem_all(ngram):
    """Return ``ngram`` with each of its words replaced by its stem.

    Words are separated on any run of whitespace and re-joined with single
    spaces, so repeated spaces or tabs never produce empty tokens and the
    word count of the result matches ``len(ngram.split())``.
    """
    return " ".join(st.stem(word) for word in ngram.split())
class NgramsCache(defaultdict): class NgramsCache(defaultdict):
"""This allows the fast retrieval of ngram ids """This allows the fast retrieval of ngram ids
...@@ -16,13 +23,31 @@ class NgramsCache(defaultdict): ...@@ -16,13 +23,31 @@ class NgramsCache(defaultdict):
def __missing__(self, terms):
    """Return the Ngram for ``terms``, fetching or inserting it on a miss.

    The terms are stripped and lower-cased before use.  The matching Ngram
    for ``self.language`` is read from the database, or created when it
    does not exist.  The ngram is then linked to its stemmed form through
    a ``NodeNgramNgram`` row attached to the node named 'Stem', and stored
    in the cache under the normalised terms.

    Raises whatever the ORM raises when the 'Stem' node type or the 'Stem'
    node cannot be fetched.
    """
    terms = terms.strip().lower()

    # A key that differs from its normalised form (case, outer whitespace)
    # lands here on every access; reuse the cached entry instead of going
    # back to the database each time.
    if terms in self:
        return self[terms]

    try:
        ngram = node.models.Ngram.objects.get(terms=terms, language=self.language)
    except node.models.Ngram.DoesNotExist as error:
        print(error)
        ngram = node.models.Ngram(terms=terms, n=len(terms.split()), language=self.language)
        ngram.save()

    # NOTE(review): the indentation of the original was lost; the stem
    # linking is placed after the try/except so that it runs for ngrams
    # already in the database as well as new ones -- confirm this nesting.
    stem_terms = stem_all(ngram.terms)
    stem, _ = node.models.Ngram.objects.get_or_create(
        terms=stem_terms, language=ngram.language, n=ngram.n,
    )

    type_stem = node.models.NodeType.objects.get(name='Stem')
    node_stem = node.models.Node.objects.get(name='Stem', type=type_stem)

    # Create the stem -> ngram link only if it is not there yet; the score
    # is set on creation and left untouched on an existing link.
    node.models.NodeNgramNgram.objects.get_or_create(
        node=node_stem, ngramx=stem, ngramy=ngram, defaults={'score': 1},
    )

    self[terms] = ngram
    return self[terms]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment