Commit 49286b9a authored by Mathieu Rodic

[OPTI] Started optimizing parsing

https://forge.iscpif.fr/issues/1438
parent 76c1a3dd
from node import models
from gargantext_web import settings from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session'] __all__ = ['literalquery', 'session', 'cache', 'Session']
......
...@@ -12,6 +12,7 @@ from node.admin import CustomForm ...@@ -12,6 +12,7 @@ from node.admin import CustomForm
from gargantext_web.db import * from gargantext_web.db import *
from gargantext_web.settings import DEBUG from gargantext_web.settings import DEBUG
from parsing.corpus import parse_resources
def project(request, project_id): def project(request, project_id):
...@@ -49,6 +50,7 @@ def project(request, project_id): ...@@ -49,6 +50,7 @@ def project(request, project_id):
.join(Resource, Resource.id == Node_Resource.resource_id) .join(Resource, Resource.id == Node_Resource.resource_id)
.filter(Node.parent_id == project.id) .filter(Node.parent_id == project.id)
.group_by(Node, Resource) .group_by(Node, Resource)
.order_by(Node.name)
) )
corpora_by_resourcetype = defaultdict(list) corpora_by_resourcetype = defaultdict(list)
documents_count_by_resourcetype = defaultdict(int) documents_count_by_resourcetype = defaultdict(int)
...@@ -69,7 +71,7 @@ def project(request, project_id): ...@@ -69,7 +71,7 @@ def project(request, project_id):
donut = [ donut = [
{ 'source': key, { 'source': key,
'count': value, 'count': value,
'part' : round(value * 100 / total_documents_count), 'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
} }
for key, value in documents_count_by_resourcetype.items() for key, value in documents_count_by_resourcetype.items()
] ]
...@@ -108,10 +110,11 @@ def project(request, project_id): ...@@ -108,10 +110,11 @@ def project(request, project_id):
) )
# let's start the workflow # let's start the workflow
try: try:
if DEBUG is True: parse_resources(dj_corpus, user_id=request.user.id)
dj_corpus.workflow() # if DEBUG is True:
else: # dj_corpus.workflow()
dj_corpus.workflow.apply_async((), countdown=3) # else:
# dj_corpus.workflow.apply_async((), countdown=3)
except Exception as error: except Exception as error:
print('WORKFLOW ERROR') print('WORKFLOW ERROR')
print(error) print(error)
......
from collections import defaultdict from collections import defaultdict
from datetime import datetime
from gargantext_web.db import * from gargantext_web.db import *
from .FileParsers import * from .FileParsers import *
_parsers = {
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser, 'pubmed' : PubmedFileParser,
'isi' : IsiFileParser, 'isi' : IsiFileParser,
'ris' : RisFileParser, 'ris' : RisFileParser,
'europress' : EuropressFileParser, 'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser, 'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser, 'europress_english' : EuropressFileParser,
} }
def __missing__(self, key):
if key not in self._parsers:
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
return parser
parsers = Parsers()
def parse_corpus_resources(corpus, user=None, user_id=None):
def parse_resources(corpus, user=None, user_id=None):
session = Session() session = Session()
type_id = cache.NodeType['Document'] corpus_id = corpus.id
type_id = cache.NodeType['Document'].id
if user_id is None and user is not None: if user_id is None and user is not None:
user_id = user.id user_id = user.id
# keep all the parsers in a cache
parsers = defaultdict(lambda key: _parsers[key]())
# find resource of the corpus # find resource of the corpus
resources_query = (session resources_query = (session
.query(Resource, ResourceType) .query(Resource, ResourceType)
.join(ResourceType, ResourceType.id == Resource.type_id) .join(ResourceType, ResourceType.id == Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource) .join(Node_Resource, Node_Resource.resource_id == Resource.id)
.join(Node, Node.id == Node_Resource.node_id) .filter(Node_Resource.node_id == corpus.id)
.filter(Node.parent_id == corpus.id) .filter(Node_Resource.parsed == False)
) )
# make a new node for every parsed document of the corpus # make a new node for every parsed document of the corpus
nodes = list() nodes = list()
for resource, resourcetype in resources_query: for resource, resourcetype in resources_query:
parser = parsers[resourcetype.name] parser = parsers[resourcetype.name]
for metadata_dict in resource: for metadata_dict in parser.parse(resource.file):
# retrieve language ID from metadata # retrieve language ID from metadata
if 'language_iso2' in metadata_dict: if 'language_iso2' in metadata_dict:
try: try:
language_id = cache.Langage[metadata_dict['language_iso2']] language_id = cache.Language[metadata_dict['language_iso2']].id
except KeyError: except KeyError:
language_id = None language_id = None
else: else:
language_id = None language_id = None
# create new node # create new node
node = Node( node = Node()
name = metadata.get('title', ''), node.name = metadata_dict.get('title', '')
parent_id = corpus.id, node.parent_id = corpus_id
user_id = user_id, node.user_id = user_id
type_id = type_id, node.type_id = type_id
language_id = language_id, node.language_id = language_id
metadata = metadata_dict, node.metadata = metadata_dict
) node.date = datetime.utcnow()
nodes.append(node) nodes.append(node)
session.add_bulk(nodes) #
# TODO: mark node-resources associations as parsed
#
session.add_all(nodes)
session.commit() session.commit()
# now, index the metadata # now, index the metadata
for node in nodes: for node in nodes:
...@@ -62,18 +79,17 @@ def parse_corpus_resources(corpus, user=None, user_id=None): ...@@ -62,18 +79,17 @@ def parse_corpus_resources(corpus, user=None, user_id=None):
metadata = cache.Metadata[key] metadata = cache.Metadata[key]
if metadata.type == 'string': if metadata.type == 'string':
metadata_value = metadata_value[:255] metadata_value = metadata_value[:255]
node_metadata = Node_Metadata(**{ node_metadata = Node_Metadata()
'node_id': node_id, node_metadata.node_id = node_id
'metadata_id': metadata.id, node_metadata.metadata_id = metadata.id
'value_'+metadata.type: value, setattr(node_metadata, 'value_'+metadata.type, metadata_value)
})
session.add(node_metadata) session.add(node_metadata)
session.commit() session.commit()
# mark the corpus as parsed # mark the corpus as parsed
corpus.parsed = True corpus.parsed = True
def parse_corpus(corpus): def extract_ngrams(corpus):
# prepare the cache for ngrams # prepare the cache for ngrams
from nodes import models from nodes import models
ngrams = ModelCache(models.Node) ngrams = ModelCache(models.Node)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment