Commit 07a6f374 authored by Mathieu Rodic

Minor corrections on the FileParser class

parent be8928f9
...@@ -20,6 +20,14 @@ class NgramCache: ...@@ -20,6 +20,14 @@ class NgramCache:
self._cache[terms] = ngram self._cache[terms] = ngram
return self._cache[terms] return self._cache[terms]
class NgramCaches(collections.defaultdict):
    """Mapping from a language to its NgramCache, filled in on demand.

    Looking up a language that has no entry yet builds an ``NgramCache``
    for that language, stores it under the same key and returns it.
    """

    def __missing__(self, language):
        # Invoked by dict lookup when `language` is not yet a key.
        cache = NgramCache(language)
        self[language] = cache
        return cache
"""Base class for performing files parsing depending on their type. """Base class for performing files parsing depending on their type.
""" """
...@@ -32,7 +40,7 @@ class FileParser: ...@@ -32,7 +40,7 @@ class FileParser:
else: else:
self._file = file self._file = file
# cache for ngrams # cache for ngrams
self._ngramcaches = collections.defaultdicts(NgramCache) self._ngramcaches = NgramCaches()
# extractors # extractors
self._extractors = {} self._extractors = {}
self._document_nodetype = NodeType.get(label='document') self._document_nodetype = NodeType.get(label='document')
...@@ -90,8 +98,8 @@ class FileParser: ...@@ -90,8 +98,8 @@ class FileParser:
# parse it! # parse it!
ngrams = self.extract_ngrams(contents, language) ngrams = self.extract_ngrams(contents, language)
# we should already be in a transaction, so no use doing another one (or is there?) # we are already in a transaction, so no use doing another one (or is there?)
ngramcache = self._ngramcaches[language.iso3] ngramcache = self._ngramcaches[language]
for terms, occurences in ngrams.items(): for terms, occurences in ngrams.items():
ngram_text = ' '.join([term[0] for term in terms]) ngram_text = ' '.join([term[0] for term in terms])
ngram = ngramcache[ngram_text] ngram = ngramcache[ngram_text]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment