Commit c02ab045 authored by Mathieu Rodic

[OPTI] Using bulk_create for 'Node_Ngram' insertion

[FEATURE] The 'extract_ngrams' method is now available from the queryset as well
[BUGFIX] The parsed resources are now marked as parsed in the database
parent b96a7eec
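
Note on the changes below: the [OPTI] item replaces one INSERT per ngram association (a save() inside a loop, hence one database round-trip per row) with a single batched INSERT through bulk_create. A minimal sketch of the pattern in isolation (the Item model and its fields are illustrative, not taken from this repository):

    # before: N queries, one per association
    for text, weight in associations.items():
        Item(text=text, weight=weight).save()

    # after: one batched query
    Item.objects.bulk_create([
        Item(text=text, weight=weight)
        for text, weight in associations.items()
    ])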
@@ -7,7 +7,7 @@ from django_hstore import hstore
 from cte_tree.models import CTENode, Manager
 #from cte_tree.fields import DepthField, PathField, OrderingField
-from parsing.Caches import LanguagesCache
+from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
 from parsing.FileParsers import *
 from time import time
 from collections import defaultdict
@@ -52,8 +52,29 @@ class NodeType(models.Model):
     def __str__(self):
         return self.name
 
+class NodeQuerySet(models.query.QuerySet):
+    """Methods available from Node querysets."""
+    def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
+        if ngramsextractorscache is None:
+            ngramsextractorscache = NgramsExtractorsCache()
+        if ngramscaches is None:
+            ngramscaches = NgramsCaches()
+        for node in self:
+            node.extract_ngrams(keys, ngramsextractorscache, ngramscaches)
+
+class NodeManager(models.Manager):
+    """Methods available from Node.objects."""
+    def get_queryset(self):
+        return NodeQuerySet(self.model)
+    def __getattr__(self, name, *args):
+        if name.startswith("_"):
+            raise AttributeError
+        return getattr(self.get_queryset(), name, *args)
+
 class Node(CTENode):
-    objects = Manager()
+    """The node."""
+    objects = NodeManager()
     user = models.ForeignKey(User)
     type = models.ForeignKey(NodeType)
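
Note on the hunk above: NodeManager.__getattr__ forwards any public attribute lookup to a fresh NodeQuerySet, so Node.objects.extract_ngrams(...) and corpus.children.all().extract_ngrams(...) both end up in NodeQuerySet.extract_ngrams; the underscore guard is likely there to keep dunder lookups (e.g. from copy or pickle) from being forwarded to a throwaway queryset. A standalone sketch of the same delegation pattern with generic names (not from this codebase):

    from django.db import models

    class ArticleQuerySet(models.query.QuerySet):
        def published(self):
            # chainable, queryset-level helper
            return self.filter(is_published=True)

    class ArticleManager(models.Manager):
        def get_queryset(self):
            return ArticleQuerySet(self.model, using=self._db)
        def __getattr__(self, name, *args):
            # never delegate private/dunder lookups
            if name.startswith('_'):
                raise AttributeError(name)
            return getattr(self.get_queryset(), name, *args)

    class Article(models.Model):
        title = models.CharField(max_length=200)
        is_published = models.BooleanField(default=False)
        objects = ArticleManager()

    # Article.objects.published() and Article.objects.filter(...).published()
    # now both work.

On newer Django versions the same effect is usually achieved with ArticleQuerySet.as_manager() or models.Manager.from_queryset(ArticleQuerySet), without the __getattr__ indirection.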
@@ -73,13 +94,6 @@ class Node(CTENode):
     def __str__(self):
         return self.name
 
-    # TODO: check what this method is actually used for
-    def liste(self, user):
-        for noeud in Node.objects.filter(user=user):
-            print(noeud.depth * " " + "[%d] %d" % (noeud.pk, noeud.name))
-
     def add_resource(self, **kwargs):
         resource = Resource(**kwargs)
@@ -103,9 +117,8 @@ class Node(CTENode):
                 'ris' : RisFileParser,
                 'europress' : EuropressFileParser,
             })[resource.type.name]()
-            print(parser)
             metadata_list += parser.parse(str(resource.file))
-        # insert in the database!
+        # insert the new resources in the database!
         type = NodeType.objects.get(name='Document')
         langages_cache = LanguagesCache()
         Node.objects.bulk_create([
@@ -119,11 +132,18 @@
             )
             for metadata in metadata_list
         ])
+        # update resources status: say they are now parsed
+        self.node_resource.filter(parsed=False).update(parsed=True)
 
-    def extract_ngrams(self, keys, cache):
+    def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
+        # if there is no cache...
+        if ngramsextractorscache is None:
+            ngramsextractorscache = NgramsExtractorsCache()
+        if ngramscaches is None:
+            ngramscaches = NgramsCaches()
         # what do we want from the cache?
-        extractor = cache.extractors[self.language]
-        ngrams = cache.ngrams[self.language]
+        extractor = ngramsextractorscache[self.language]
+        ngrams = ngramscaches[self.language]
         # find & count all the occurrences
         associations = defaultdict(float) # float or int?
         if isinstance(keys, dict):
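
Note on the hunk above: the [BUGFIX] line self.node_resource.filter(parsed=False).update(parsed=True) marks every pending resource as parsed in a single SQL UPDATE; QuerySet.update() does not load the rows or call save() on them, which is enough here since only the flag changes. The optional cache arguments keep extract_ngrams usable on its own (fresh caches per call) as well as from NodeQuerySet.extract_ngrams (one cache shared by every node in the queryset). A rough sketch of the two update styles, given some Node instance named node (shown only for illustration):

    # what the commit does: one UPDATE statement, no per-object save()
    node.node_resource.filter(parsed=False).update(parsed=True)

    # the per-object alternative it avoids: one query to fetch the rows,
    # then one UPDATE per resource
    for node_resource in node.node_resource.filter(parsed=False):
        node_resource.parsed = True
        node_resource.save()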
@@ -137,13 +157,14 @@
                     terms = ' '.join([token for token, tag in ngram])
                     associations[terms] += 1
         # insert the occurrences in the database
-        # TODO: use bulk_create instead
-        for ngram_text, weight in associations.items():
+        Node_Ngram.objects.bulk_create([
             Node_Ngram(
                 node = self,
                 ngram = ngrams[ngram_text],
                 weight = weight
-            ).save()
+            )
+            for ngram_text, weight in associations.items()
+        ])
 
 
 class Node_Resource(models.Model):
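
Note on the hunk above: bulk_create() inserts the whole list in one batch, but it does not call save() on the instances and does not send pre_save/post_save signals; depending on the Django version and database backend it may also leave the primary keys of the created objects unset. None of that matters for simple weight rows like Node_Ngram, so the rewrite is safe. If a document produced a very large number of distinct ngrams, passing batch_size would cap the size of each INSERT; a hedged variant of the new code (the batch size is an arbitrary illustration, not part of the commit):

    Node_Ngram.objects.bulk_create(
        [
            Node_Ngram(node=self, ngram=ngrams[ngram_text], weight=weight)
            for ngram_text, weight in associations.items()
        ],
        batch_size=1000,  # illustrative value only
    )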
@@ -32,18 +32,11 @@ except:
     corpus = Node(name='My first corpus', type=typeCorpus, user=me)
     corpus.save()
 print('Remove previously existing children of the corpus...')
 corpus.children.all().delete()
 print('Adding a resource to the corpus...')
 corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
 print('Adding the corpus resources...')
 corpus.parse_resources()
-cache = Caches()
-for child in corpus.children.all():
-    if child.language:
-        print('#%d\t%s\n%s\n' % (child.id, child.name, child.language.fullname))
-    else:
-        print('#%d\t%s\n\n' % (child.id, child.name))
-#    print(child.metadata)
-#    print()
-    child.extract_ngrams(['title', 'abstract'], cache)
-#    child.extract_ngrams({'title':1., 'abstract':.2}, cache)
\ No newline at end of file
+
+print('Extracting ngrams from the documents...')
+corpus.children.all().extract_ngrams(['title', 'abstract'])
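
Note on the hunk above: thanks to the NodeManager/NodeQuerySet delegation added in the models file, ngram extraction now runs over the whole queryset with one shared cache instead of one cache per document. A usage sketch based on the objects this script already defines (typePubmed and corpus come from earlier, unchanged lines of the file):

    corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
    corpus.parse_resources()  # also flags the resources as parsed now
    corpus.children.all().extract_ngrams(['title', 'abstract'])
    # per the extract_ngrams code, weighted keys are accepted too:
    # corpus.children.all().extract_ngrams({'title': 1.0, 'abstract': 0.2})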