Commit 60b36dcc authored by Mathieu Rodic

[FEATURE] Ngrams extractors - Implemented a cache for the extractors

parent f6122f1c
......@@ -7,3 +7,4 @@ class EnglishNgramsExtractor(NgramsExtractor):
# Set up this extractor's tagger: create an NltkTagger instance and keep it
# on self.tagger.
def start(self):
self.tagger = NltkTagger()
\ No newline at end of file
#from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
#from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
import collections
class NgramsExtractorsCache(collections.defaultdict):
    """Cache of ngrams extractor instances, indexed by language.

    Looking up a language that is not in the cache yet selects the extractor
    class matching that language, reuses an already cached instance of that
    exact class when there is one, and otherwise instantiates it. Different
    spellings of the same language ("en", "eng", "english", ...) therefore
    share a single extractor instance.
    """

    def __missing__(self, key):
        """Return the extractor for ``key`` and store it in the cache.

        ``key`` is either a language name or code given as a string, or an
        object exposing its language code through an ``iso3`` attribute.
        Languages that are neither English nor French get the generic
        ``NgramsExtractor``.

        Raises ``AttributeError`` when ``key`` is not a string and has no
        ``iso3`` attribute.
        """
        # format the language; codes read from ``iso3`` get the same
        # normalisation as plain strings so that "FRA" or " eng " still match
        language = key if isinstance(key, str) else key.iso3
        if isinstance(language, str):
            language = language.strip().lower()
        # find the proper extractor
        if language in ("en", "eng", "english"):
            Extractor = EnglishNgramsExtractor
        elif language in ("fr", "fra", "fre", "french"):
            Extractor = FrenchNgramsExtractor
        else:
            Extractor = NgramsExtractor
        # try to see if already instantiated, otherwise do it; the match is
        # on the exact class (not isinstance) because EnglishNgramsExtractor
        # subclasses NgramsExtractor and must not be handed out as the
        # generic extractor
        extractor = next(
            (cached for cached in self.values() if type(cached) is Extractor),
            None,
        )
        if extractor is None:
            extractor = Extractor()
        self[key] = extractor
        # return the proper extractor
        return extractor
......@@ -46,7 +46,7 @@ Shall be used for french texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath = "./Taggers/treetagger"):
def start(self, treeTaggerPath = "./parsing/Taggers/treetagger"):
binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
tagcmdlist = [
binaryFile,
......
from parsing.Taggers.Tagger import Tagger
from parsing.Taggers.NltkTagger import NltkTagger
from parsing.Taggers.TreeTagger import TreeTagger
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment