Commit b4fb7c9a authored by Mathieu Rodic

[FEATURE] Started putting ngrams extraction under 'Node'

parent b8c609db
...@@ -72,10 +72,34 @@ class Node(CTENode): ...@@ -72,10 +72,34 @@ class Node(CTENode):
for noeud in Node.objects.filter(user=user): for noeud in Node.objects.filter(user=user):
print(noeud.depth * " " + "[%d] %d" % (noeud.pk, noeud.name)) print(noeud.depth * " " + "[%d] %d" % (noeud.pk, noeud.name))
def extract_ngrams(self, keys, cache):
    """Extract ngrams from this node's metadata and save their weights.

    Args:
        keys: either a dict mapping metadata keys to the weight given to
            each ngram occurrence found under that key, or an iterable of
            metadata keys (each occurrence then counts for 1).
        cache: currently never read by this method (see the note below).
    """
    # TODO: instantiate the ngrams extractors -- where should their cache live?
    # NOTE(review): `extractor_cache` and `ngrams_cache` are resolved as
    # module-level names that this block does not define, while the `cache`
    # argument is ignored. Presumably `cache` is meant to provide them --
    # confirm before relying on this method.
    extractor = extractor_cache[self.language.iso2]
    ngrams = ngrams_cache[self.language.iso2]
    # Normalise both accepted shapes of `keys` to (key, weight) pairs.
    if isinstance(keys, dict):
        weighted_keys = keys.items()
    else:
        weighted_keys = ((key, 1) for key in keys)
    # find & count all the occurrences
    associations = defaultdict(float)  # float or int?
    for key, weight in weighted_keys:
        for ngram in extractor.extract_ngrams(self.metadata[key]):
            # Accumulate per extracted ngram, not per metadata key: the
            # loop below uses these dictionary keys as ngram texts.
            # NOTE(review): assumes the extractor yields hashable text
            # values accepted by the ngrams cache -- confirm.
            associations[ngram] += weight
    # insert the occurrences in the database
    for ngram_text, weight in associations.items():
        Node_Ngram(
            node=self,
            ngram=ngrams[ngram_text],
            weight=weight,
        ).save()
class Node_Ngram(models.Model):
    """Association row linking a node to an ngram, with a weight."""
    node = models.ForeignKey(Node, on_delete=models.CASCADE)
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    # This commit renames the former `occurences` column to `weight`.
    # NOTE(review): Node.extract_ngrams accumulates weights in a
    # defaultdict(float), yet this column is an integer -- confirm which
    # type is intended.
    weight = models.IntegerField()
class Project(Node): class Project(Node):
class Meta: class Meta:
......
class NgramsCache:
    """In-memory cache of ``Ngram`` objects for a single language.

    This allows the fast retrieval of ngrams from a cache instead of
    calling the database every time.
    """

    def __init__(self, language):
        """Create an empty cache bound to ``language``."""
        self._cache = dict()
        self._language = language

    def __getitem__(self, terms):
        """Return the ``Ngram`` for ``terms``, creating and saving it if needed.

        ``terms`` is stripped and lower-cased before any lookup, so
        differently cased or padded spellings share one entry.
        """
        terms = terms.strip().lower()
        try:
            return self._cache[terms]
        except KeyError:
            pass
        # Query through the model manager. The previous `Ngram.get(...)`
        # call raised AttributeError, which a bare `except` swallowed, so a
        # new row was inserted on every cache miss.
        ngram = Ngram.objects.filter(
            terms=terms,
            language=self._language,
        ).first()
        if ngram is None:
            ngram = Ngram(
                terms=terms,
                n=len(terms.split()),
                language=self._language,
            )
            ngram.save()
        self._cache[terms] = ngram
        return ngram
class NgramsCaches(collections.defaultdict):
    """Mapping from a language to its ``NgramsCache``, filled on demand."""

    def __missing__(self, language):
        """Build, store and return the cache for a language not seen yet."""
        self[language] = NgramsCache(language)
        # Read the entry back through the mapping, as a plain lookup would.
        return self[language]
class Cache:
    """Holder for the caches used while extracting ngrams."""

    def __init__(self):
        """Create the per-language ngram caches."""
        self.ngrams_caches = NgramsCaches()
        # TODO: a second attribute assignment was started here (`self.`) but
        # never finished; the dangling statement was a syntax error and has
        # been removed until the attribute is defined.
\ No newline at end of file
...@@ -6,32 +6,7 @@ import dateutil.parser ...@@ -6,32 +6,7 @@ import dateutil.parser
import zipfile import zipfile
class NgramCache:
    """In-memory cache of ``Ngram`` objects for a single language.

    This allows the fast retrieval of ngrams from a cache instead of
    calling the database every time.
    """

    def __init__(self, language):
        """Create an empty cache bound to ``language``."""
        self._cache = dict()
        self._language = language

    def __getitem__(self, terms):
        """Return the ``Ngram`` for ``terms``, creating and saving it if needed.

        ``terms`` is stripped and lower-cased before any lookup, so
        differently cased or padded spellings share one entry.
        """
        terms = terms.strip().lower()
        try:
            return self._cache[terms]
        except KeyError:
            pass
        # Query through the model manager. The previous `Ngram.get(...)`
        # call raised AttributeError, which a bare `except` swallowed, so a
        # new row was inserted on every cache miss.
        ngram = Ngram.objects.filter(
            terms=terms,
            language=self._language,
        ).first()
        if ngram is None:
            ngram = Ngram(
                terms=terms,
                n=len(terms.split()),
                language=self._language,
            )
            ngram.save()
        self._cache[terms] = ngram
        return ngram
class NgramCaches(collections.defaultdict):
    """Mapping from a language to its ``NgramCache``, filled on demand."""

    def __missing__(self, language):
        """Build, store and return the cache for a language not seen yet."""
        self[language] = NgramCache(language)
        # Read the entry back through the mapping, as a plain lookup would.
        return self[language]
......
from node.models import Node, NodeType, User, Language
from parsing.NgramsExtractors import NgramsExtractorsCache
from parsing.Caches import Cache, NgramsCaches
# Demo script: rebuilds a sample corpus of 64 documents, then runs ngram
# extraction on the title of each document.

# Fetch the demo user, creating it on first run.
me, _ = User.objects.get_or_create(username='Mat')

# Each node type is fetched on its own, so a missing 'document' type can no
# longer cause a duplicate 'corpus' type to be inserted. The lookup goes
# through the model manager: the previous `NodeType.get(...)` call always
# failed and was hidden by a bare `except`.
typeCorpus, _ = NodeType.objects.get_or_create(name='corpus')
typeDoc, _ = NodeType.objects.get_or_create(name='document')

english = Language.objects.get(iso2='en')

# WARNING: this wipes every node in the database before rebuilding the tree.
Node.objects.all().delete()

corpus, _ = Node.objects.get_or_create(
    name='My first corpus',
    defaults={'type': typeCorpus, 'user': me},
)

for i in range(64):
    title = 'Document #%d' % i
    # No resource is attached to these sample documents yet.
    Node(
        user=me,
        name=title,
        language=english,
        metadata={'title': title},
        type=typeDoc,
        parent=corpus,
    ).save()

extractor_cache = NgramsExtractorsCache()
# Node.extract_ngrams indexes `ngrams_cache` by language first, which matches
# the per-language NgramsCaches mapping rather than a single NgramsCache
# (whose constructor also requires a language argument).
# NOTE(review): this object is not passed to extract_ngrams below -- confirm
# how the method is meant to receive it.
ngrams_cache = NgramsCaches()

for child in corpus.children.all():
    print(child.id)
    child.extract_ngrams(['title'], extractor_cache)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment