Commit 05170a8f authored by Mathieu Rodic

[FEATURE] The 'extract_ngrams' method on Node seems to be working!

Still needs to be fully tested.
parent b4fb7c9a
...@@ -75,8 +75,8 @@ class Node(CTENode): ...@@ -75,8 +75,8 @@ class Node(CTENode):
def extract_ngrams(self, keys, cache): def extract_ngrams(self, keys, cache):
# TODO: instanciate the ngrams extractors # TODO: instanciate the ngrams extractors
# WHERE TO PUT THEIR CACHE? # WHERE TO PUT THEIR CACHE?
extractor = extractor_cache[self.language.iso2] extractor = cache.extractors[self.language.iso2]
ngrams = ngrams_cache[self.language.iso2] ngrams = cache.ngrams[self.language]
# find & count all the occurrences # find & count all the occurrences
associations = defaultdict(float) # float or int? associations = defaultdict(float) # float or int?
if isinstance(keys, dict): if isinstance(keys, dict):
......
import collections
from node.models import Ngram
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor
class NgramsCache:
class NgramsCache(collections.defaultdict):
"""This allows the fast retrieval of ngram ids """This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time from a cache instead of calling the database every time
""" """
def __init__(self, language): def __init__(self, language):
self._cache = dict() self.language = language
self._language = language
def __getitem__(self, terms): def __missing__(self, terms):
terms = terms.strip().lower()
if terms not in self._cache:
try: try:
ngram = Ngram.get(terms=terms, language=self._language) ngram = Ngram.get(terms=terms, language=self.language)
except: except:
ngram = Ngram(terms=terms, n=len(terms.split()), language=self._language) ngram = Ngram(terms=terms, n=len(terms.split()), language=self.language)
ngram.save() ngram.save()
self._cache[terms] = ngram self[terms] = ngram
return self._cache[terms] return self[terms]
class NgramsCaches(collections.defaultdict): class NgramsCaches(collections.defaultdict):
...@@ -27,9 +27,40 @@ class NgramsCaches(collections.defaultdict): ...@@ -27,9 +27,40 @@ class NgramsCaches(collections.defaultdict):
self[language] = NgramsCache(language) self[language] = NgramsCache(language)
return self[language] return self[language]
class NgramsExtractorsCache(collections.defaultdict):
    """Cache of ngrams extractors, indexed by language.

    A key is either a language name or code given as a string (for
    instance ``"en"``, ``"fra"`` or ``" English "``), or an object
    exposing an ``iso3`` attribute. Every key that resolves to the same
    extractor class shares one single extractor instance.
    """

    def __missing__(self, key):
        """Find or create the extractor for ``key`` and remember it.

        Called by the dictionary lookup when ``key`` is not cached yet.
        Returns the extractor that is now stored under ``key``.
        """
        # format the language
        if isinstance(key, str):
            language = key.strip().lower()
        else:
            language = key.iso3
        # find the proper extractor class
        if language in ("en", "eng", "english"):
            Extractor = EnglishNgramsExtractor
        elif language in ("fr", "fra", "fre", "french"):
            Extractor = FrenchNgramsExtractor
        else:
            # The module-level import only brings in the English and French
            # extractors, so the generic one is imported here to avoid a
            # NameError for every other language.
            from parsing.NgramsExtractors import NgramsExtractor
            Extractor = NgramsExtractor
        # Reuse an already instantiated extractor of that exact class.
        # The comparison is on the exact type (not isinstance) so that an
        # instance of a subclass is never handed out in place of the
        # requested class.
        for extractor in self.values():
            if type(extractor) is Extractor:
                break
        else:
            extractor = Extractor()
        # store outside of the loop, so the dict is never modified while
        # it is being iterated
        self[key] = extractor
        return extractor
class Cache:
class Cache:
"""This is THE cache of the caches."""
def __init__(self): def __init__(self):
self.ngrams_caches = NgramsCaches() self.ngrams = NgramsCaches()
self. self.extractors = NgramsExtractorsCache()
\ No newline at end of file
...@@ -2,36 +2,3 @@ from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor ...@@ -2,36 +2,3 @@ from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
import collections
class NgramsExtractorsCache(collections.defaultdict):
    """Cache of ngrams extractors, indexed by language.

    A key is either a language name or code given as a string (for
    instance ``"en"``, ``"fra"`` or ``" English "``), or an object
    exposing an ``iso3`` attribute. Every key that resolves to the same
    extractor class shares one single extractor instance.
    """

    def __missing__(self, key):
        """Find or create the extractor for ``key`` and remember it.

        Called by the dictionary lookup when ``key`` is not cached yet.
        Returns the extractor that is now stored under ``key``.
        """
        # format the language
        if isinstance(key, str):
            language = key.strip().lower()
        else:
            language = key.iso3
        # find the proper extractor class
        if language in ("en", "eng", "english"):
            Extractor = EnglishNgramsExtractor
        elif language in ("fr", "fra", "fre", "french"):
            Extractor = FrenchNgramsExtractor
        else:
            # any other language falls back on the generic extractor
            Extractor = NgramsExtractor
        # Reuse an already instantiated extractor of that exact class.
        # The comparison is on the exact type (not isinstance) so that an
        # instance of a subclass is never handed out in place of the
        # requested class.
        for extractor in self.values():
            if type(extractor) is Extractor:
                break
        else:
            extractor = Extractor()
        # store outside of the loop, so the dict is never modified while
        # it is being iterated
        self[key] = extractor
        return extractor
from node.models import Node, NodeType, User, Language from node.models import Node, NodeType, User, Language
from parsing.NgramsExtractors import NgramsExtractorsCache
from parsing.Caches import Cache from parsing.Caches import Cache
try: try:
...@@ -39,8 +38,7 @@ except: ...@@ -39,8 +38,7 @@ except:
parent = corpus parent = corpus
).save() ).save()
extractor_cache = NgramsExtractorsCache() cache = Cache()
ngrams_cache = NgramsCache()
for child in corpus.children.all(): for child in corpus.children.all():
print(child.id) print(child.id)
child.extract_ngrams(['title'], extractor_cache) child.extract_ngrams(['title'], cache)
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment