Commit c02ab045 authored by Mathieu Rodic's avatar Mathieu Rodic

[OPTI] Using bulk_create for 'Node_Ngram' insertion

[FEATURE] The 'extract_ngrams' method is now available from the queryset as well
[BUGFIX] The parsed resources are now marked as parsed in the database
parent b96a7eec
...@@ -7,7 +7,7 @@ from django_hstore import hstore ...@@ -7,7 +7,7 @@ from django_hstore import hstore
from cte_tree.models import CTENode, Manager from cte_tree.models import CTENode, Manager
#from cte_tree.fields import DepthField, PathField, OrderingField #from cte_tree.fields import DepthField, PathField, OrderingField
from parsing.Caches import LanguagesCache from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
from parsing.FileParsers import * from parsing.FileParsers import *
from time import time from time import time
from collections import defaultdict from collections import defaultdict
...@@ -52,8 +52,29 @@ class NodeType(models.Model): ...@@ -52,8 +52,29 @@ class NodeType(models.Model):
def __str__(self): def __str__(self):
return self.name return self.name
class NodeQuerySet(models.query.QuerySet):
    """Methods available from Node querysets."""

    def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
        """Extract n-grams from every node in this queryset.

        keys -- metadata keys to extract n-grams from (passed through to
            each node's own extract_ngrams method).
        ngramsextractorscache -- optional shared extractors cache; a fresh
            NgramsExtractorsCache is created when omitted so that all nodes
            in the queryset still share one cache.
        ngramscaches -- optional shared n-grams cache; same default policy.
        """
        # Build the caches once, up front, so every node reuses them.
        extractors = (NgramsExtractorsCache()
                      if ngramsextractorscache is None
                      else ngramsextractorscache)
        ngrams = (NgramsCaches()
                  if ngramscaches is None
                  else ngramscaches)
        for node in self:
            node.extract_ngrams(keys, extractors, ngrams)
class NodeManager(models.Manager):
    """Methods available from Node.objects."""

    def get_queryset(self):
        """Return a NodeQuerySet bound to this manager's database.

        Passing using=self._db keeps the manager's database routing
        (required for multi-database setups; without it the queryset
        silently falls back to the default database).
        """
        return NodeQuerySet(self.model, using=self._db)

    def __getattr__(self, name, *args):
        """Forward unknown attribute lookups to the queryset.

        This is what makes queryset methods such as extract_ngrams
        callable directly on Node.objects. Dunder/private names are
        refused so that copying/pickling the manager does not recurse.
        """
        if name.startswith("_"):
            # Include the attribute name in the error for easier debugging.
            raise AttributeError(name)
        return getattr(self.get_queryset(), name, *args)
class Node(CTENode): class Node(CTENode):
objects = Manager() """The node."""
objects = NodeManager()
user = models.ForeignKey(User) user = models.ForeignKey(User)
type = models.ForeignKey(NodeType) type = models.ForeignKey(NodeType)
...@@ -73,13 +94,6 @@ class Node(CTENode): ...@@ -73,13 +94,6 @@ class Node(CTENode):
def __str__(self): def __str__(self):
return self.name return self.name
# TODO: voir à quoi sert cette méthode
def liste(self, user):
for noeud in Node.objects.filter(user=user):
print(noeud.depth * " " + "[%d] %d" % (noeud.pk, noeud.name))
def add_resource(self, **kwargs): def add_resource(self, **kwargs):
resource = Resource(**kwargs) resource = Resource(**kwargs)
...@@ -103,9 +117,8 @@ class Node(CTENode): ...@@ -103,9 +117,8 @@ class Node(CTENode):
'ris' : RisFileParser, 'ris' : RisFileParser,
'europress' : EuropressFileParser, 'europress' : EuropressFileParser,
})[resource.type.name]() })[resource.type.name]()
print(parser)
metadata_list += parser.parse(str(resource.file)) metadata_list += parser.parse(str(resource.file))
# insert in the database! # insert the new resources in the database!
type = NodeType.objects.get(name='Document') type = NodeType.objects.get(name='Document')
langages_cache = LanguagesCache() langages_cache = LanguagesCache()
Node.objects.bulk_create([ Node.objects.bulk_create([
...@@ -119,11 +132,18 @@ class Node(CTENode): ...@@ -119,11 +132,18 @@ class Node(CTENode):
) )
for metadata in metadata_list for metadata in metadata_list
]) ])
# update resources status: say they are now parsed
def extract_ngrams(self, keys, cache): self.node_resource.filter(parsed=False).update(parsed=True)
def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
# if there is no cache...
if ngramsextractorscache is None:
ngramsextractorscache = NgramsExtractorsCache()
if ngramscaches is None:
ngramscaches = NgramsCaches()
# what do we want from the cache? # what do we want from the cache?
extractor = cache.extractors[self.language] extractor = ngramsextractorscache[self.language]
ngrams = cache.ngrams[self.language] ngrams = ngramscaches[self.language]
# find & count all the occurrences # find & count all the occurrences
associations = defaultdict(float) # float or int? associations = defaultdict(float) # float or int?
if isinstance(keys, dict): if isinstance(keys, dict):
...@@ -137,13 +157,14 @@ class Node(CTENode): ...@@ -137,13 +157,14 @@ class Node(CTENode):
terms = ' '.join([token for token, tag in ngram]) terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1 associations[terms] += 1
# insert the occurrences in the database # insert the occurrences in the database
# TODO: use bulk_create instead Node_Ngram.objects.bulk_create([
for ngram_text, weight in associations.items():
Node_Ngram( Node_Ngram(
node = self, node = self,
ngram = ngrams[ngram_text], ngram = ngrams[ngram_text],
weight = weight weight = weight
).save() )
for ngram_text, weight in associations.items()
])
class Node_Resource(models.Model): class Node_Resource(models.Model):
......
...@@ -32,18 +32,11 @@ except: ...@@ -32,18 +32,11 @@ except:
corpus = Node(name='My first corpus', type=typeCorpus, user=me) corpus = Node(name='My first corpus', type=typeCorpus, user=me)
corpus.save() corpus.save()
print('Remove previously existing children of the corpus...')
corpus.children.all().delete() corpus.children.all().delete()
print('Adding a resource to the corpus...')
corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed) corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
print('Adding the corpus resources...')
corpus.parse_resources() corpus.parse_resources()
print('Extracting ngrams from the documents...')
cache = Caches() corpus.children.all().extract_ngrams(['title', 'abstract'])
for child in corpus.children.all():
if child.language:
print('#%d\t%s\n%s\n' % (child.id, child.name, child.language.fullname))
else:
print('#%d\t%s\n\n' % (child.id, child.name))
# print(child.metadata)
# print()
child.extract_ngrams(['title', 'abstract'], cache)
# child.extract_ngrams({'title':1., 'abstract':.2}, cache)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment