Commit 49286b9a (humanities/gargantext)
authored Feb 27, 2015 by Mathieu Rodic
[OPTI] Started optimizing parsing
https://forge.iscpif.fr/issues/1438
parent 76c1a3dd
Showing 3 changed files, with 57 additions and 38 deletions (+57 -38):
gargantext_web/db.py               +1  -1
gargantext_web/views_optimized.py  +8  -5
parsing/corpus.py                  +48 -32
gargantext_web/db.py

-from node import models
 from gargantext_web import settings
+from node import models
 __all__ = ['literalquery', 'session', 'cache', 'Session']
...
gargantext_web/views_optimized.py

...
@@ -12,6 +12,7 @@ from node.admin import CustomForm
 from gargantext_web.db import *
 from gargantext_web.settings import DEBUG
+from parsing.corpus import parse_resources
 def project(request, project_id):
...
@@ -49,6 +50,7 @@ def project(request, project_id):
         .join(Resource, Resource.id == Node_Resource.resource_id)
         .filter(Node.parent_id == project.id)
         .group_by(Node, Resource)
+        .order_by(Node.name)
     )
     corpora_by_resourcetype = defaultdict(list)
     documents_count_by_resourcetype = defaultdict(int)
...
@@ -69,7 +71,7 @@ def project(request, project_id):
     donut = [
         {'source': key,
          'count': value,
-         'part': round(value * 100 / total_documents_count),
+         'part': round(value * 100 / total_documents_count) if total_documents_count else 0,
         }
         for key, value in documents_count_by_resourcetype.items()
     ]
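Note: the one-line change to 'part' guards against a ZeroDivisionError when a project has no documents yet (total_documents_count == 0). A minimal standalone sketch of the guarded computation, with invented sample counts:

    documents_count_by_resourcetype = {'pubmed': 3, 'isi': 1}
    total_documents_count = sum(documents_count_by_resourcetype.values())
    donut = [
        {'source': key,
         'count': value,
         'part': round(value * 100 / total_documents_count) if total_documents_count else 0,
        }
        for key, value in documents_count_by_resourcetype.items()
    ]
    print(donut)  # [{'source': 'pubmed', 'count': 3, 'part': 75}, {'source': 'isi', 'count': 1, 'part': 25}]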
...
@@ -108,10 +110,11 @@ def project(request, project_id):
         )
         # let's start the workflow
         try:
-            # if DEBUG is True:
-            # dj_corpus.workflow()
-            # else:
-            # dj_corpus.workflow.apply_async((), countdown=3)
+            if DEBUG is True:
+                parse_resources(dj_corpus, user_id=request.user.id)
+                dj_corpus.workflow()
+            else:
+                dj_corpus.workflow.apply_async((), countdown=3)
         except Exception as error:
             print('WORKFLOW ERROR')
             print(error)
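Note: the rewritten block runs parsing synchronously under DEBUG and defers to the task queue otherwise; apply_async((), countdown=3) matches Celery's task API (empty argument tuple, start roughly 3 seconds later). A self-contained sketch of the same dispatch pattern, assuming a Celery worker and broker are available; the task, broker URL and corpus id below are illustrative, not from the repository:

    from celery import Celery

    app = Celery('tasks', broker='redis://localhost:6379/0')

    @app.task
    def workflow(corpus_id):
        print('running workflow for corpus %d' % corpus_id)

    DEBUG = True
    if DEBUG is True:
        workflow(42)                              # run inline; tracebacks surface immediately
    else:
        workflow.apply_async((42,), countdown=3)  # queue for a worker, start after ~3 s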
...
parsing/corpus.py

 from collections import defaultdict
+from datetime import datetime
 from gargantext_web.db import *
 from .FileParsers import *
-_parsers = {
-    'pubmed'            : PubmedFileParser,
-    'isi'               : IsiFileParser,
-    'ris'               : RisFileParser,
-    'europress'         : EuropressFileParser,
-    'europress_french'  : EuropressFileParser,
-    'europress_english' : EuropressFileParser,
-}
+# keep all the parsers in a cache
+class Parsers(defaultdict):
+    _parsers = {
+        'pubmed'            : PubmedFileParser,
+        'isi'               : IsiFileParser,
+        'ris'               : RisFileParser,
+        'europress'         : EuropressFileParser,
+        'europress_french'  : EuropressFileParser,
+        'europress_english' : EuropressFileParser,
+    }
+
+    def __missing__(self, key):
+        if key not in self._parsers:
+            raise NotImplementedError('No such parser: "%s"' % (key))
+        parser = self._parsers[key]()
+        self[key] = parser
+        return parser
+
+parsers = Parsers()
-def parse_corpus_resources(corpus, user=None, user_id=None):
+def parse_resources(corpus, user=None, user_id=None):
     session = Session()
-    type_id = cache.NodeType['Document']
+    corpus_id = corpus.id
+    type_id = cache.NodeType['Document'].id
     if user_id is None and user is not None:
         user_id = user.id
-    # keep all the parsers in a cache
-    parsers = defaultdict(lambda key: _parsers[key]())
     # find resource of the corpus
     resources_query = (session
         .query(Resource, ResourceType)
         .join(ResourceType, ResourceType.id == Resource.type_id)
-        .join(Node_Resource, Node_Resource.resource_id == Resource)
-        .join(Node, Node.id == Node_Resource.node_id)
-        .filter(Node.parent_id == corpus.id)
+        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
+        .filter(Node_Resource.node_id == corpus.id)
+        .filter(Node_Resource.parsed == False)
     )
     # make a new node for every parsed document of the corpus
     nodes = list()
     for resource, resourcetype in resources_query:
         parser = parsers[resourcetype.name]
-        for metadata_dict in resource:
+        for metadata_dict in parser.parse(resource.file):
             # retrieve language ID from metadata
             if 'language_iso2' in metadata_dict:
                 try:
-                    language_id = cache.Language[metadata_dict['language_iso2']]
+                    language_id = cache.Language[metadata_dict['language_iso2']].id
                 except KeyError:
                     language_id = None
             else:
                 language_id = None
             # create new node
-            node = Node(
-                name = metadata.get('title', ''),
-                parent_id = corpus.id,
-                user_id = user_id,
-                type_id = type_id,
-                language_id = language_id,
-                metadata = metadata_dict,
-            )
+            node = Node()
+            node.name = metadata_dict.get('title', '')
+            node.parent_id = corpus_id
+            node.user_id = user_id
+            node.type_id = type_id
+            node.language_id = language_id
+            node.metadata = metadata_dict
+            node.date = datetime.utcnow()
             nodes.append(node)
-    session.add_bulk(nodes)
+    #
+    # TODO: mark node-resources associations as parsed
+    #
+    session.add_all(nodes)
     session.commit()
     # now, index the metadata
     for node in nodes:
...
@@ -62,18 +79,17 @@ def parse_corpus_resources(corpus, user=None, user_id=None):
             metadata = cache.Metadata[key]
             if metadata.type == 'string':
                 metadata_value = metadata_value[:255]
-            node_metadata = Node_Metadata(**{
-                'node_id': node_id,
-                'metadata_id': metadata.id,
-                'value_' + metadata.type: value,
-            })
+            node_metadata = Node_Metadata()
+            node_metadata.node_id = node_id
+            node_metadata.metadata_id = metadata.id
+            setattr(node_metadata, 'value_' + metadata.type, metadata_value)
             session.add(node_metadata)
     session.commit()
     # mark the corpus as parsed
     corpus.parsed = True
-def parse_corpus(corpus):
+def extract_ngrams(corpus):
     # prepare the cache for ngrams
     from nodes import models
     ngrams = ModelCache(models.Node)
...
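Note on the new Parsers class: the removed one-liner parsers = defaultdict(lambda key: _parsers[key]()) could never have worked, because defaultdict calls its default_factory with no arguments. Overriding __missing__ on a dict subclass does receive the missing key, so each parser is instantiated lazily on first lookup and then cached in the dict itself. A standalone sketch of the idiom, using toy stand-ins for the FileParsers classes:

    from collections import defaultdict

    class Parsers(defaultdict):
        _parsers = {'pubmed': list, 'ris': dict}  # toy stand-ins for real parser classes

        def __missing__(self, key):
            if key not in self._parsers:
                raise NotImplementedError('No such parser: "%s"' % key)
            parser = self._parsers[key]()  # instantiate on first lookup only
            self[key] = parser             # cache for every later lookup
            return parser

    parsers = Parsers()
    assert parsers['pubmed'] is parsers['pubmed']  # one shared instance per key

Relatedly, session.add_all(nodes) is the standard SQLAlchemy call for staging a batch of new objects ahead of a single commit(); the add_bulk method it replaces is not part of the SQLAlchemy Session API.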