Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
49286b9a
Commit
49286b9a
authored
Feb 27, 2015
by
Mathieu Rodic
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[OPTI] Started optimizing parsing
https://forge.iscpif.fr/issues/1438
parent
76c1a3dd
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
57 additions
and
38 deletions
+57
-38
db.py
gargantext_web/db.py
+1
-1
views_optimized.py
gargantext_web/views_optimized.py
+8
-5
corpus.py
parsing/corpus.py
+48
-32
No files found.
gargantext_web/db.py
View file @
49286b9a
from
node
import
models
from
gargantext_web
import
settings
from
node
import
models
__all__
=
[
'literalquery'
,
'session'
,
'cache'
,
'Session'
]
...
...
gargantext_web/views_optimized.py
View file @
49286b9a
...
...
@@ -12,6 +12,7 @@ from node.admin import CustomForm
from
gargantext_web.db
import
*
from
gargantext_web.settings
import
DEBUG
from
parsing.corpus
import
parse_resources
def
project
(
request
,
project_id
):
...
...
@@ -49,6 +50,7 @@ def project(request, project_id):
.
join
(
Resource
,
Resource
.
id
==
Node_Resource
.
resource_id
)
.
filter
(
Node
.
parent_id
==
project
.
id
)
.
group_by
(
Node
,
Resource
)
.
order_by
(
Node
.
name
)
)
corpora_by_resourcetype
=
defaultdict
(
list
)
documents_count_by_resourcetype
=
defaultdict
(
int
)
...
...
@@ -69,7 +71,7 @@ def project(request, project_id):
donut
=
[
{
'source'
:
key
,
'count'
:
value
,
'part'
:
round
(
value
*
100
/
total_documents_count
),
'part'
:
round
(
value
*
100
/
total_documents_count
)
if
total_documents_count
else
0
,
}
for
key
,
value
in
documents_count_by_resourcetype
.
items
()
]
...
...
@@ -108,10 +110,11 @@ def project(request, project_id):
)
# let's start the workflow
try
:
if
DEBUG
is
True
:
dj_corpus
.
workflow
()
else
:
dj_corpus
.
workflow
.
apply_async
((),
countdown
=
3
)
parse_resources
(
dj_corpus
,
user_id
=
request
.
user
.
id
)
# if DEBUG is True:
# dj_corpus.workflow()
# else:
# dj_corpus.workflow.apply_async((), countdown=3)
except
Exception
as
error
:
print
(
'WORKFLOW ERROR'
)
print
(
error
)
...
...
parsing/corpus.py
View file @
49286b9a
from
collections
import
defaultdict
from
datetime
import
datetime
from
gargantext_web.db
import
*
from
.FileParsers
import
*
_parsers
=
{
'pubmed'
:
PubmedFileParser
,
'isi'
:
IsiFileParser
,
'ris'
:
RisFileParser
,
'europress'
:
EuropressFileParser
,
'europress_french'
:
EuropressFileParser
,
'europress_english'
:
EuropressFileParser
,
}
# keep all the parsers in a cache
class
Parsers
(
defaultdict
):
def
parse_corpus_resources
(
corpus
,
user
=
None
,
user_id
=
None
):
_parsers
=
{
'pubmed'
:
PubmedFileParser
,
'isi'
:
IsiFileParser
,
'ris'
:
RisFileParser
,
'europress'
:
EuropressFileParser
,
'europress_french'
:
EuropressFileParser
,
'europress_english'
:
EuropressFileParser
,
}
def
__missing__
(
self
,
key
):
if
key
not
in
self
.
_parsers
:
raise
NotImplementedError
(
'No such parser: "
%
s"'
%
(
key
))
parser
=
self
.
_parsers
[
key
]()
self
[
key
]
=
parser
return
parser
parsers
=
Parsers
()
def
parse_resources
(
corpus
,
user
=
None
,
user_id
=
None
):
session
=
Session
()
type_id
=
cache
.
NodeType
[
'Document'
]
corpus_id
=
corpus
.
id
type_id
=
cache
.
NodeType
[
'Document'
]
.
id
if
user_id
is
None
and
user
is
not
None
:
user_id
=
user
.
id
# keep all the parsers in a cache
parsers
=
defaultdict
(
lambda
key
:
_parsers
[
key
]())
# find resource of the corpus
resources_query
=
(
session
.
query
(
Resource
,
ResourceType
)
.
join
(
ResourceType
,
ResourceType
.
id
==
Resource
.
type_id
)
.
join
(
Node_Resource
,
Node_Resource
.
resource_id
==
Resource
)
.
join
(
Node
,
Node
.
id
==
Node_Resource
.
node_
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
join
(
Node_Resource
,
Node_Resource
.
resource_id
==
Resource
.
id
)
.
filter
(
Node_Resource
.
node_id
==
corpus
.
id
)
.
filter
(
Node
_Resource
.
parsed
==
False
)
)
# make a new node for every parsed document of the corpus
nodes
=
list
()
for
resource
,
resourcetype
in
resources_query
:
parser
=
parsers
[
resourcetype
.
name
]
for
metadata_dict
in
resource
:
for
metadata_dict
in
parser
.
parse
(
resource
.
file
)
:
# retrieve language ID from metadata
if
'language_iso2'
in
metadata_dict
:
try
:
language_id
=
cache
.
Lang
age
[
metadata_dict
[
'language_iso2'
]]
language_id
=
cache
.
Lang
uage
[
metadata_dict
[
'language_iso2'
]]
.
id
except
KeyError
:
language_id
=
None
else
:
language_id
=
None
# create new node
node
=
Node
(
name
=
metadata
.
get
(
'title'
,
''
),
parent_id
=
corpus
.
id
,
user_id
=
user_id
,
type_id
=
type_id
,
language_id
=
language_id
,
metadata
=
metadata_dict
,
)
node
=
Node
(
)
node
.
name
=
metadata_dict
.
get
(
'title'
,
''
)
node
.
parent_id
=
corpus_id
node
.
user_id
=
user_id
node
.
type_id
=
type_id
node
.
language_id
=
language_id
node
.
metadata
=
metadata_dict
node
.
date
=
datetime
.
utcnow
(
)
nodes
.
append
(
node
)
session
.
add_bulk
(
nodes
)
#
# TODO: mark node-resources associations as parsed
#
session
.
add_all
(
nodes
)
session
.
commit
()
# now, index the metadata
for
node
in
nodes
:
...
...
@@ -62,18 +79,17 @@ def parse_corpus_resources(corpus, user=None, user_id=None):
metadata
=
cache
.
Metadata
[
key
]
if
metadata
.
type
==
'string'
:
metadata_value
=
metadata_value
[:
255
]
node_metadata
=
Node_Metadata
(
**
{
'node_id'
:
node_id
,
'metadata_id'
:
metadata
.
id
,
'value_'
+
metadata
.
type
:
value
,
})
node_metadata
=
Node_Metadata
()
node_metadata
.
node_id
=
node_id
node_metadata
.
metadata_id
=
metadata
.
id
setattr
(
node_metadata
,
'value_'
+
metadata
.
type
,
metadata_value
)
session
.
add
(
node_metadata
)
session
.
commit
()
# mark the corpus as parsed
corpus
.
parsed
=
True
def
parse_corpu
s
(
corpus
):
def
extract_ngram
s
(
corpus
):
# prepare the cache for ngrams
from
nodes
import
models
ngrams
=
ModelCache
(
models
.
Node
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment