Commit 45addafc authored by Mathieu Rodic's avatar Mathieu Rodic

Well, the PubMed files parser's development is over...

...now for some happy debugging.
parent be4cd18b
import Collections
import collections
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database for every call
class Ngram_Cache:
    """In-memory cache mapping normalized ngram terms to NGram instances.

    Avoids one database round-trip per lookup by memoizing the
    get-or-create result for the lifetime of this cache object.
    """

    def __init__(self):
        # normalized terms (stripped, lowercased) -> NGram instance
        self._cache = {}

    def get(self, terms):
        """Return the NGram for *terms*, creating and saving it if absent.

        Terms are normalized (stripped, lowercased) before lookup so that
        equivalent spellings share a single cache entry.
        """
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
                ngram = NGram.get(terms=terms)
            except Exception:
                # Was a bare `except:` — narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                # NOTE(review): n=len(terms) is the *character* count, not a
                # token count — confirm this is the intended "n" of the ngram.
                ngram = NGram(terms=terms, n=len(terms))
                ngram.save()
            self._cache[terms] = ngram
        return self._cache[terms]
"""Base class for performing files parsing depending on their type.
"""
......@@ -10,14 +30,16 @@ class FileParser:
self._file = open(filepath, "rb")
else:
self._file = file
# ...and parse!
self.parse()
# cache for ngrams
self._ngram_caches = collections.defaultdicts(Ngram_Cache)
# extractors
self._extractors = {}
self._document_nodetype = NodeType.get(label='document')
with Language.objects.all() as languages:
self._languages_iso2 = {language.iso2.lower(): language for language in Language}
self._languages_iso3 = {language.iso3.lower(): language for language in Language}
# ...and parse!
self.parse()
"""Extract the ngrams from a given text.
"""
......@@ -25,18 +47,20 @@ class FileParser:
# Get the appropriate ngrams extractor, if it exists
if language not in self._extractors:
extractor = None
if language == 'en':
if language.iso2 == 'en':
extractor = EnglishNgramsExtractor()
elif language == 'fr':
elif language.iso2 == 'fr':
extractor = FrenchNgramsExtractor()
self._extractors[language] = extractor
else:
extractor = self._extractors[language]
# Extract the
# Extract the ngrams
if extractor:
return extractor.extract_ngrams(text)
return collections.Counter(
[token for token, tag in extractor.extract_ngrams(text)]
)
else:
return []
return {}
"""Add a document to the database.
"""
......@@ -62,7 +86,17 @@ class FileParser:
# parse it!
ngrams = self.extract_ngrams(contents, language)
for
# we should already be in a transaction, so no use doing another one (or is there?)
# btw, this is not very good (the get/insert part)
ngram_cache = self._ngram_caches[language.iso3]
for ngram_text, count in ngrams.items():
ngram = ngram_cache.get(ngram_text)
Node_Ngram(
node = childNode,
ngram = ngram,
count = count
)
# return the created document
return document
......
......@@ -8,23 +8,7 @@ import Collections
# import chardet
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database
class Ngram_Cache:
    """In-memory cache mapping normalized ngram terms to NGram primary keys.

    Unlike the object-caching variant elsewhere in this commit, this one
    stores only ``ngram.pk``, keeping the cache lightweight and avoiding a
    database hit for every repeated ngram-id lookup.
    """

    def __init__(self):
        # normalized terms (stripped, lowercased) -> NGram primary key
        self._cache = {}

    def get(self, terms):
        """Return the primary key of the NGram for *terms*, creating it if absent.

        Terms are normalized (stripped, lowercased) before lookup so that
        equivalent spellings share a single cache entry.
        """
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
                ngram = NGram.get(terms=terms)
            except Exception:
                # Was a bare `except:` — narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                # NOTE(review): n=len(terms) is the *character* count, not a
                # token count — confirm this is the intended "n" of the ngram.
                ngram = NGram(terms=terms, n=len(terms))
                ngram.save()
            self._cache[terms] = ngram.pk
        return self._cache[terms]
......
......@@ -85,6 +85,9 @@ class Node(MP_Node):
file = models.FileField(upload_to=upload_to, blank=True)
resource = models.ForeignKey(Resource)
ngrams = models.ManyToManyField(NGrams)
#objects = hstore.HStoreManager()
def __str__(self):
    """Return this node's name as its human-readable representation."""
    return self.name
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment