Minor corrections...

be8928f9 · Mathieu Rodic · 712f8f0a · be8928f9
Commit be8928f9 authored Oct 19, 2014 by Mathieu Rodic
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 6 deletions

FileParser.py mat-parsing/FileParsers/FileParser.py +7 -6

No files found.
--- a/mat-parsing/FileParsers/FileParser.py
+++ b/mat-parsing/FileParsers/FileParser.py
@@ -3,13 +3,13 @@ import collections
 # This allows the fast retrieval of ngram ids
 # from the cache instead of using the database for every call
-class Ngram_Cache:
+class NgramCache:
    def __init__(self, language):
        self._cache = {}
        self._language = language
-    def get(self, terms):
+    def __getitem__(self, terms):
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
@@ -32,7 +32,7 @@ class FileParser:
        else:
            self._file = file
        # cache for ngrams
-        self._ngram_caches = collections.defaultdicts(Ngram_Cache)
+        self._ngramcaches = collections.defaultdicts(NgramCache)
        # extractors
        self._extractors = {}
        self._document_nodetype = NodeType.get(label='document')
@@ -91,9 +91,10 @@ class FileParser:
        # parse it!
        ngrams = self.extract_ngrams(contents, language)
        # we should already be in a transaction, so no use doing another one (or is there?)
-        ngram_cache = self._ngram_caches[language.iso3]
+        ngramcache = self._ngramcaches[language.iso3]
-        for ngram_text, occurences in ngrams.items():
+        for terms, occurences in ngrams.items():
-            ngram = ngram_cache.get(ngram_text)
+            ngram_text = ' '.join([term[0] for term in terms])
+            ngram = ngramcache[ngram_text]
            Node_Ngram(
                node       = childNode,
                ngram      = ngram,