Commit 45addafc authored by Mathieu Rodic's avatar Mathieu Rodic

Well, the PubMed files parser's development is over...

...now for some happy debugging.
parent be4cd18b
import Collections
import collections
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database for every call
class Ngram_Cache:
    """In-memory cache mapping normalized ngram terms to NGram instances.

    Avoids one database round-trip per lookup by memoizing the
    get-or-create result for the lifetime of this cache object.
    """

    def __init__(self):
        # normalized terms (stripped, lowercased) -> NGram instance
        self._cache = {}

    def get(self, terms):
        """Return the NGram for *terms*, creating and saving it if absent.

        Terms are normalized (stripped, lowercased) before lookup so that
        equivalent spellings share a single cache entry.
        """
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
                ngram = NGram.get(terms=terms)
            except Exception:
                # Was a bare `except:` — narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                # NOTE(review): n=len(terms) is the *character* count, not a
                # token count — confirm this is the intended "n" of the ngram.
                ngram = NGram(terms=terms, n=len(terms))
                ngram.save()
            self._cache[terms] = ngram
        return self._cache[terms]
"""Base class for performing files parsing depending on their type.
"""
......@@ -10,14 +30,16 @@ class FileParser:
self._file = open(filepath, "rb")
else:
self._file = file
# ...and parse!
self.parse()
# cache for ngrams
self._ngram_caches = collections.defaultdicts(Ngram_Cache)
# extractors
self._extractors = {}
self._document_nodetype = NodeType.get(label='document')
with Language.objects.all() as languages:
self._languages_iso2 = {language.iso2.lower(): language for language in Language}
self._languages_iso3 = {language.iso3.lower(): language for language in Language}
# ...and parse!
self.parse()
"""Extract the ngrams from a given text.
"""
......@@ -25,18 +47,20 @@ class FileParser:
# Get the appropriate ngrams extractor, if it exists
if language not in self._extractors:
extractor = None
if language == 'en':
if language.iso2 == 'en':
extractor = EnglishNgramsExtractor()
elif language == 'fr':
elif language.iso2 == 'fr':
extractor = FrenchNgramsExtractor()
self._extractors[language] = extractor
else:
extractor = self._extractors[language]
# Extract the
# Extract the ngrams
if extractor:
return extractor.extract_ngrams(text)
return collections.Counter(
[token for token, tag in extractor.extract_ngrams(text)]
)
else:
return []
return {}
"""Add a document to the database.
"""
......@@ -62,7 +86,17 @@ class FileParser:
# parse it!
ngrams = self.extract_ngrams(contents, language)
for
# we should already be in a transaction, so no use doing another one (or is there?)
# btw, this is not very good (the get/insert part)
ngram_cache = self._ngram_caches[language.iso3]
for ngram_text, count in ngrams.items():
ngram = ngram_cache.get(ngram_text)
Node_Ngram(
node = childNode,
ngram = ngram,
count = count
)
# return the created document
return document
......
......@@ -8,23 +8,7 @@ import Collections
# import chardet
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database
class Ngram_Cache:
    """In-memory cache mapping normalized ngram terms to NGram primary keys.

    Unlike the object-caching variant elsewhere in this commit, this one
    stores only ``ngram.pk``, keeping the cache lightweight and avoiding a
    database hit for every repeated ngram-id lookup.
    """

    def __init__(self):
        # normalized terms (stripped, lowercased) -> NGram primary key
        self._cache = {}

    def get(self, terms):
        """Return the primary key of the NGram for *terms*, creating it if absent.

        Terms are normalized (stripped, lowercased) before lookup so that
        equivalent spellings share a single cache entry.
        """
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
                ngram = NGram.get(terms=terms)
            except Exception:
                # Was a bare `except:` — narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                # NOTE(review): n=len(terms) is the *character* count, not a
                # token count — confirm this is the intended "n" of the ngram.
                ngram = NGram(terms=terms, n=len(terms))
                ngram.save()
            self._cache[terms] = ngram.pk
        return self._cache[terms]
......
......@@ -85,6 +85,9 @@ class Node(MP_Node):
file = models.FileField(upload_to=upload_to, blank=True)
resource = models.ForeignKey(Resource)
ngrams = models.ManyToManyField(NGrams)
#objects = hstore.HStoreManager()
def __str__(self):
    """Return this node's name as its human-readable representation."""
    return self.name
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment