[FEATURE] The 'extract_ngrams' method on Node seems to be working!

Still needs to be fully tested.

[FEATURE] The 'extract_ngrams' method on Node seems to be working!
Still needs to be fully tested.
05170a8f · Mathieu Rodic · b4fb7c9a · 05170a8f · 05170a8f · 05170a8f
Commit 05170a8f authored Oct 28, 2014 by Mathieu Rodic
Showing with 53 additions and 57 deletions

models.py node/models.py +2 -2

Caches.py parsing/Caches.py +49 -18

__init__.py parsing/NgramsExtractors/__init__.py +0 -33

test-parsing_from_node.py test-parsing_from_node.py +2 -4

No files found.
--- a/node/models.py
+++ b/node/models.py
@@ -75,8 +75,8 @@ class Node(CTENode):
    def extract_ngrams(self, keys, cache):
        # TODO: instantiate the ngrams extractors
        # WHERE TO PUT THEIR CACHE?
-        extractor = extractor_cache[self.language.iso2]
+        extractor = cache.extractors[self.language.iso2]
-        ngrams = ngrams_cache[self.language.iso2]
+        ngrams = cache.ngrams[self.language]
        # find & count all the occurrences
        associations = defaultdict(float) # float or int?
        if isinstance(keys, dict):

--- a/parsing/Caches.py
+++ b/parsing/Caches.py
+import collections
+from node.models import Ngram
+from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor
-class NgramsCache:
+class NgramsCache(collections.defaultdict):
    """This allows the fast retrieval of ngram ids
    from a cache instead of calling the database every time
    """
    def __init__(self, language):
-        self._cache = dict()
+        self.language = language
-        self._language = language
-    def __getitem__(self, terms):
+    def __missing__(self, terms):
-        terms = terms.strip().lower()
-        if terms not in self._cache:
        try:
-                ngram = Ngram.get(terms=terms, language=self._language)
+            ngram = Ngram.get(terms=terms, language=self.language)
        except:
-                ngram = Ngram(terms=terms, n=len(terms.split()), language=self._language)
+            ngram = Ngram(terms=terms, n=len(terms.split()), language=self.language)
            ngram.save()
-            self._cache[terms] = ngram
+        self[terms] = ngram
-        return self._cache[terms]
+        return self[terms]
 class NgramsCaches(collections.defaultdict):
@@ -27,9 +27,40 @@ class NgramsCaches(collections.defaultdict):
        self[language] = NgramsCache(language)
        return self[language]
+class NgramsExtractorsCache(collections.defaultdict):
+    """This allows the fast retrieval of the ngrams extractor for a language
+    from a cache instead of instantiating a new extractor every time
+    """
+    def __missing__(self, key):
+        # format the language
+        if isinstance(key, str):
+            language = key.strip().lower()
+        else:
+            language = key.iso3
+        # find the proper extractor
+        if language in ["en", "eng", "english"]:
+            Extractor = EnglishNgramsExtractor
+        elif language in ["fr", "fra", "fre", "french"]:
+            Extractor = FrenchNgramsExtractor
+        else:
+            Extractor = NgramsExtractor
+        # reuse an extractor of this class if one is already instantiated, otherwise create it
+        found = False
+        for extractor in self.values():
+            if type(extractor) == Extractor:
+                self[key] = extractor
+                found = True
+                break
+        if not found:
+            self[key] = Extractor()
+        # return the proper extractor
+        return self[key]
-class Cache:
+class Cache:
+    """This is THE cache of the caches."""
    def __init__(self):
-        self.ngrams_caches = NgramsCaches()
+        self.ngrams = NgramsCaches()
-        self.
+        self.extractors = NgramsExtractorsCache()
\ No newline at end of file
--- a/parsing/NgramsExtractors/__init__.py
+++ b/parsing/NgramsExtractors/__init__.py
@@ -2,36 +2,3 @@ from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
 from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
 from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
-import collections
-class NgramsExtractorsCache(collections.defaultdict):
-    """This allows the fast retrieval of ngram ids
-    from a cache instead of calling the database every time
-    """
-    def __missing__(self, key):
-        # format the language
-        if isinstance(key, str):
-            language = key.strip().lower()
-        else:
-            language = key.iso3
-        # find the proper extractor
-        if language in ["en", "eng", "english"]:
-            Extractor = EnglishNgramsExtractor
-        elif language in ["fr", "fra", "fre", "french"]:
-            Extractor = FrenchNgramsExtractor
-        else:
-            Extractor = NgramsExtractor
-        # try to see if already instanciated, otherwise do it
-        found = False
-        for extractor in self.values():
-            if type(extractor) == Extractor:
-                self[key] = extractor
-                found = True
-                break
-        if not found:
-            self[key] = Extractor()
-        # return the proper extractor
-        return self[key]
--- a/test-parsing_from_node.py
+++ b/test-parsing_from_node.py
 from node.models import Node, NodeType, User, Language
-from parsing.NgramsExtractors import NgramsExtractorsCache
 from parsing.Caches import Cache
 try:
@@ -39,8 +38,7 @@ except:
            parent      = corpus
        ).save()
-extractor_cache = NgramsExtractorsCache()
+cache = Cache()
-ngrams_cache = NgramsCache()
 for child in corpus.children.all():
    print(child.id)
-    child.extract_ngrams(['title'], extractor_cache)
+    child.extract_ngrams(['title'], cache)
\ No newline at end of file