Commit 05170a8f authored by Mathieu Rodic

[FEATURE] The 'extract_ngrams' method on Node seems to be working!

Still needs to be fully tested.
parent b4fb7c9a
......@@ -75,8 +75,8 @@ class Node(CTENode):
def extract_ngrams(self, keys, cache):
# TODO: instantiate the ngrams extractors
# WHERE TO PUT THEIR CACHE?
extractor = extractor_cache[self.language.iso2]
ngrams = ngrams_cache[self.language.iso2]
extractor = cache.extractors[self.language.iso2]
ngrams = cache.ngrams[self.language]
# find & count all the occurrences
associations = defaultdict(float) # float or int?
if isinstance(keys, dict):
......
import collections
from node.models import Ngram
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor
class NgramsCache(collections.defaultdict):
    """Cache of ``Ngram`` objects for a single language, indexed by terms.

    This allows the fast retrieval of ngrams from a cache instead of
    calling the database every time. Looking up terms that are not cached
    yet first tries to fetch the matching ``Ngram``; if that fails, a new
    ``Ngram`` is built, saved and cached.
    """

    def __init__(self, language):
        """Create an empty cache bound to `language`."""
        collections.defaultdict.__init__(self)
        self.language = language

    def __missing__(self, terms):
        """Fetch or create the ``Ngram`` for `terms`, cache it and return it.

        NOTE(review): terms are used exactly as given (no strip/lower
        normalisation happens here) -- confirm that callers normalise them.
        """
        try:
            # NOTE(review): on a Django model the lookup is usually spelled
            # `Ngram.objects.get(...)`; confirm that `Ngram.get` exists,
            # otherwise this always falls through to the creation branch.
            ngram = Ngram.get(terms=terms, language=self.language)
        except Exception:
            # `Exception` rather than a bare `except`, so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            ngram = Ngram(terms=terms, n=len(terms.split()), language=self.language)
            ngram.save()
        self[terms] = ngram
        return ngram
class NgramsCaches(collections.defaultdict):
    """Mapping from a language to its ``NgramsCache``.

    Looking up a language that has no entry yet builds a ``NgramsCache`` for
    it, stores it under that language and returns it.
    """

    def __missing__(self, language):
        # First access for this language: build its cache and remember it.
        language_cache = NgramsCache(language)
        self[language] = language_cache
        return language_cache
class NgramsExtractorsCache(collections.defaultdict):
    """Cache of ngrams extractor instances, indexed by language.

    A key is either a string (language name or code) or an object exposing
    an ``iso3`` attribute. On the first lookup of a key the matching
    extractor class is selected; an instance of exactly that class already
    stored under another key is reused, otherwise a new one is created.
    """

    def __missing__(self, key):
        """Resolve `key` to an extractor, store it under `key` and return it."""
        # format the language
        if isinstance(key, str):
            language = key.strip().lower()
        else:
            language = key.iso3
        # find the proper extractor
        if language in ("en", "eng", "english"):
            Extractor = EnglishNgramsExtractor
        elif language in ("fr", "fra", "fre", "french"):
            Extractor = FrenchNgramsExtractor
        else:
            # Imported here so the generic fallback is always resolvable,
            # whatever the enclosing module imports at its top.
            from parsing.NgramsExtractors import NgramsExtractor
            Extractor = NgramsExtractor
        # Reuse an instance of exactly this class if one is already cached.
        # An exact type match (not isinstance) keeps one extractor class from
        # being served by an instance of a related class.
        extractor = next(
            (cached for cached in self.values() if type(cached) is Extractor),
            None,
        )
        if extractor is None:
            extractor = Extractor()
        self[key] = extractor
        # return the proper extractor
        return extractor
class Cache:
    """This is THE cache of the caches.

    It groups the caches in a single object:
    ``ngrams`` is a ``NgramsCaches`` and ``extractors`` is a
    ``NgramsExtractorsCache``.
    """

    def __init__(self):
        """Create both caches, initially empty."""
        self.ngrams = NgramsCaches()
        self.extractors = NgramsExtractorsCache()
......@@ -2,36 +2,3 @@ from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
import collections
class NgramsExtractorsCache(collections.defaultdict):
    """Cache of ngrams extractor instances, indexed by language.

    Keys are strings (language name or code) or objects exposing an ``iso3``
    attribute. A missing key is resolved to an extractor class; an instance
    of that class already held under another key is shared, otherwise a new
    instance is created.
    """

    def __missing__(self, key):
        # Normalise the key into a language code or name.
        language = key.strip().lower() if isinstance(key, str) else key.iso3
        # Select the extractor class for that language.
        if language in ["en", "eng", "english"]:
            extractor_class = EnglishNgramsExtractor
        elif language in ["fr", "fra", "fre", "french"]:
            extractor_class = FrenchNgramsExtractor
        else:
            extractor_class = NgramsExtractor
        # Share an already instantiated extractor of that exact class.
        for existing in self.values():
            if type(existing) == extractor_class:
                self[key] = existing
                return existing
        # None cached yet: instantiate, store and return it.
        created = extractor_class()
        self[key] = created
        return created
from node.models import Node, NodeType, User, Language
from parsing.NgramsExtractors import NgramsExtractorsCache
from parsing.Caches import Cache
try:
......@@ -39,8 +38,7 @@ except:
parent = corpus
).save()
# A single Cache object is shared by every extraction of this run.
cache = Cache()
# Extract the ngrams found in the 'title' key of each child of the corpus.
for child in corpus.children.all():
    print(child.id)
    child.extract_ngrams(['title'], cache)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment