Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
e7ac6426
Commit
e7ac6426
authored
Aug 30, 2017
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'gargantext.org/simon-unstable-notebook' into unstable-merge
parents
dae0243d
30c1dbdc
Changes
12
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
305 additions
and
353 deletions
+305
-353
base.py
gargantext/models/base.py
+21
-1
nodes.py
gargantext/models/nodes.py
+6
-2
http.py
gargantext/util/http.py
+2
-1
gargantext_notebook.py
install/notebook/gargantext_notebook.py
+131
-73
cern.py
moissonneurs/cern.py
+4
-1
hal.py
moissonneurs/hal.py
+4
-1
isidore.py
moissonneurs/isidore.py
+4
-1
istex.py
moissonneurs/istex.py
+4
-1
multivac.py
moissonneurs/multivac.py
+4
-1
pubmed.py
moissonneurs/pubmed.py
+5
-1
AdvancedTutorial.ipynb
notebooks/AdvancedTutorial.ipynb
+96
-245
project.html
templates/pages/projects/project.html
+24
-25
No files found.
gargantext/models/base.py
View file @
e7ac6426
from
sqlalchemy.schema
import
Column
,
ForeignKey
,
UniqueConstraint
,
Index
from
sqlalchemy.orm
import
relationship
from
sqlalchemy.orm
import
relationship
,
validates
from
sqlalchemy.types
import
TypeDecorator
,
\
Integer
,
Float
,
Boolean
,
DateTime
,
String
,
Text
from
sqlalchemy.dialects.postgresql
import
JSONB
,
DOUBLE_PRECISION
as
Double
...
...
@@ -7,6 +7,7 @@ from sqlalchemy.ext.mutable import MutableDict, MutableList
from
sqlalchemy.ext.declarative
import
declarative_base
__all__
=
[
"Column"
,
"ForeignKey"
,
"UniqueConstraint"
,
"relationship"
,
"validates"
,
"ValidatorMixin"
,
"Integer"
,
"Float"
,
"Boolean"
,
"DateTime"
,
"String"
,
"Text"
,
"TypeDecorator"
,
"JSONB"
,
"Double"
,
...
...
@@ -18,6 +19,25 @@ __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
# all tables handled by Alembic migration scripts.
Base
=
declarative_base
()
# To be used by tables already handled by Django ORM, such as User model. We
# separate them in order to keep those out of Alembic sight.
DjangoBase
=
declarative_base
()
class ValidatorMixin(object):
    """Mixin offering reusable helpers for SQLAlchemy ``@validates`` hooks."""

    def enforce_length(self, key, value):
        """Truncate a string according to its column length.

        The maximum length is read from the first column mapped behind the
        class attribute named ``key``.

        :param key: name of the mapped attribute being validated.
        :param value: candidate value for that attribute.
        :return: ``value`` cut down to the column length when it is too long;
            otherwise ``value`` unchanged (this includes falsy values such as
            ``None`` or ``""`` and columns that declare no length).

        Usage example (do not forget to *return* the result, otherwise the
        validator would store ``None``):

        .. code-block:: python

            @validates('some_column')
            def validate_some_column(self, key, value):
                return self.enforce_length(key, value)
        """
        column_type = getattr(self.__class__, key).prop.columns[0].type
        # Column types without a length limit either expose ``length`` as None
        # or have no such attribute at all: in both cases nothing to enforce.
        max_len = getattr(column_type, 'length', None)

        if max_len is None or not value:
            return value

        if len(value) > max_len:
            return value[:max_len]

        return value
gargantext/models/nodes.py
View file @
e7ac6426
...
...
@@ -9,7 +9,7 @@ from datetime import datetime
from
.base
import
Base
,
Column
,
ForeignKey
,
relationship
,
TypeDecorator
,
Index
,
\
Integer
,
Float
,
String
,
DateTime
,
JSONB
,
\
MutableList
,
MutableDict
MutableList
,
MutableDict
,
validates
,
ValidatorMixin
from
.users
import
User
__all__
=
[
'Node'
,
'NodeNode'
,
'CorpusNode'
]
...
...
@@ -26,7 +26,7 @@ class NodeType(TypeDecorator):
return
NODETYPES
[
typeindex
]
class
Node
(
Base
):
class
Node
(
ValidatorMixin
,
Base
):
"""This model can fit many purposes:
myFirstCorpus = session.query(CorpusNode).first()
...
...
@@ -112,6 +112,10 @@ class Node(Base):
'user_id={0.user_id}, parent_id={0.parent_id}, '
\
'name={0.name!r}, date={0.date})>'
.
format
(
self
)
@
validates
(
'name'
)
def
validate_name
(
self
,
key
,
value
):
return
self
.
enforce_length
(
key
,
value
)
@
property
def
ngrams
(
self
):
"""Pseudo-attribute allowing to retrieve a node's ngrams.
...
...
gargantext/util/http.py
View file @
e7ac6426
...
...
@@ -73,7 +73,8 @@ from rest_framework.views import APIView
from
gargantext.util.json
import
json_encoder
def
JsonHttpResponse
(
data
,
status
=
200
):
return
HttpResponse
(
content
=
json_encoder
.
encode
(
data
),
content
=
data
.
encode
(
'utf-8'
)
if
isinstance
(
data
,
str
)
else
\
json_encoder
.
encode
(
data
),
content_type
=
'application/json; charset=utf-8'
,
status
=
status
)
...
...
install/notebook/gargantext_notebook.py
View file @
e7ac6426
#!/usr/bin/env python
"""
Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
http://iscpif.fr
...
...
@@ -6,45 +7,29 @@ http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
"""
#!/usr/bin/env python
import
sys
import
os
import
os
import
django
# Django settings
dirname
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
os
.
environ
.
setdefault
(
"DJANGO_SETTINGS_MODULE"
,
"gargantext.settings"
)
os
.
environ
.
setdefault
(
'DJANGO_SETTINGS_MODULE'
,
'gargantext.settings'
)
django
.
setup
()
# initialize Django application
from
django.core.wsgi
import
get_wsgi_application
application
=
get_wsgi_application
()
from
gargantext.constants
import
QUERY_SIZE_N_MAX
,
get_resource
,
get_resource_by_name
from
gargantext.models
import
ProjectNode
,
DocumentNode
,
UserNode
,
User
from
gargantext.util.db
import
session
,
get_engine
from
collections
import
Counter
import
importlib
from
django.http
import
Http404
from
gargantext.util.toolchain.main
import
parse_extract_indexhyperdata
from
gargantext.util.db
import
*
from
gargantext.models
import
Node
from
gargantext.util.toolchain.main
import
parse_extract_indexhyperdata
from
nltk.tokenize
import
wordpunct_tokenize
from
gargantext.models
import
*
from
nltk.tokenize
import
word_tokenize
import
nltk
as
nltk
from
statistics
import
mean
from
math
import
log
from
collections
import
defaultdict
import
matplotlib.pyplot
as
plt
import
numpy
as
np
import
datetime
class
NotebookError
(
Exception
):
pass
from
collections
import
Counter
from
langdetect
import
detect
as
detect_lang
def
documents
(
corpus_id
):
return
(
session
.
query
(
Node
)
.
filter
(
Node
.
parent_id
==
corpus_id
,
Node
.
typename
==
"DOCUMENT"
)
# .order_by(Node.hyperdata['publication_date'])
.
all
()
)
return
(
session
.
query
(
DocumentNode
)
.
filter_by
(
parent_id
=
corpus_id
)
#.order_by(Node.hyperdata['publication_date'])
.
all
())
#import seaborn as sns
...
...
@@ -56,18 +41,21 @@ def chart(docs, field):
frame1
=
pd
.
DataFrame
(
year_publis
,
columns
=
[
'Date'
,
'DateValue'
],
index
=
frame0
.
Date
)
return
frame1
from
gargantext.util.crawlers.HAL
import
HalCrawler
def scan_hal(request):
    """Return what a fresh ``HalCrawler`` reports from ``scan_results``.

    :param request: query handed over untouched to ``HalCrawler.scan_results``.
    """
    return HalCrawler().scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count the nodes of a corpus matching a full-text search query.

    :param corpus_id: compared against ``nodes.parent_id``.
    :param lang: text search configuration given to ``to_tsvector``.
    :param request: query string given to ``to_tsquery``.
    :return: first column of the first row returned by the count query.
    """
    # TODO: add some sugar to the request (ideally the request should be the
    # same for HAL and Gargantext).
    #
    # Values are sent as bound parameters rather than being interpolated into
    # the SQL text, so that quotes in ``lang`` or ``request`` can neither break
    # nor hijack the statement.
    #
    # NOTE(review): ``hyperdata ->> 'abstract' || 'title'`` appends the literal
    # string 'title' to the abstract; presumably ``hyperdata ->> 'title'`` was
    # intended -- left unchanged, to be confirmed.
    query = """select count(n.id) from nodes n
               where to_tsvector(%s, hyperdata ->> 'abstract' || 'title')
                     @@ to_tsquery(%s)
               AND n.parent_id = %s;"""

    connection = get_engine().connect()
    try:
        rows = connection.execute(query, (lang, request, corpus_id))
        return [row for row in rows][0][0]
    finally:
        # Always release the connection, including when the query fails
        # (closing used to be placed after the return and never ran).
        connection.close()
...
...
@@ -77,47 +65,117 @@ def myProject_fromUrl(url):
myProject :: String -> Project
"""
project_id
=
url
.
split
(
"/"
)[
4
]
project
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
project_id
)
.
first
(
)
project
=
session
.
query
(
ProjectNode
)
.
get
(
project_id
)
return
project
def
newCorpus
(
project
,
resourceName
=
11
,
name
=
"Machine learning"
,
query
=
"LSTM"
):
print
(
"Corpus
\"
%
s
\"
in project
\"
%
s
\"
created"
%
(
name
,
project
.
name
))
corpus
=
project
.
add_child
(
name
=
"Corpus name"
,
typename
=
'CORPUS'
)
corpus
.
hyperdata
[
"resources"
]
=
[{
"extracted"
:
"true"
,
"type"
:
11
}]
corpus
.
hyperdata
[
"statuses"
]
=
[{
"action"
:
"notebook"
,
"complete"
:
"true"
}]
# [TODO] Add informations needed to get buttons on the Project view.
session
.
add
(
corpus
)
session
.
commit
()
hal
=
HalCrawler
()
max_result
=
hal
.
scan_results
(
query
)
paging
=
100
for
page
in
range
(
0
,
max_result
,
paging
):
print
(
"
%
s documents downloaded /
%
s."
%
(
str
(
paging
*
(
page
+
1
)),
str
(
max_result
)
))
docs
=
(
hal
.
_get
(
query
,
fromPage
=
page
,
count
=
paging
)
.
get
(
"response"
,
{})
.
get
(
"docs"
,
[])
)
from
gargantext.util.parsers.HAL
import
HalParser
# [TODO] fix boilerplate for docs here
new_docs
=
HalParser
(
docs
)
.
_parse
(
docs
)
for
doc
in
new_docs
:
new_doc
=
(
corpus
.
add_child
(
name
=
doc
[
"title"
][:
255
]
,
typename
=
'DOCUMENT'
)
)
new_doc
[
"hyperdata"
]
=
doc
session
.
add
(
new_doc
)
session
.
commit
()
print
(
"Extracting the ngrams"
)
parse_extract_indexhyperdata
(
corpus
)
print
(
"Corpus is ready to explore:"
)
print
(
"http://imt.gargantext.org/projects/
%
s/corpora/
%
s/"
%
(
project
.
id
,
corpus
.
id
))
return
corpus
def newCorpus(project, source, name=None, query=None):
    """Build a corpus in ``project`` by running the moissonneur of ``source``.

    :param project: a ``ProjectNode`` instance.
    :param source: source identifier, either an ``int`` (looked up with
        ``get_resource``) or a ``str`` (looked up with
        ``get_resource_by_name``).
    :param name: corpus name; defaults to ``query`` when omitted.
    :param query: query string forwarded to the moissonneur.
    :return: whatever ``run_moissonneur`` returns.
    :raises NotebookError: when an argument has the wrong type or when no
        moissonneur can be found for ``source``.
    """
    if name is None:
        name = query

    # Same precedence as before: a problem with source, query or name is
    # reported in preference to a problem with the project.
    error = False
    if not isinstance(project, ProjectNode):
        error = "a valid project"
    if not isinstance(source, (int, str)):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"

    if error:
        raise NotebookError("Please provide %s." % error)

    if isinstance(source, int):
        resource = get_resource(source)
    else:
        resource = get_resource_by_name(source)

    if resource:
        moissonneur_name = get_moissonneur_name(resource)
    elif isinstance(source, str):
        # Unknown resource name: try it directly as a moissonneur module name.
        moissonneur_name = source.lower()
    else:
        # A numeric id without any matching resource cannot be turned into a
        # module name (int has no ``lower``): report it as any other invalid
        # source instead of failing with an AttributeError.
        raise NotebookError("Invalid source identifier: %r" % source)

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError as err:
        raise NotebookError("Invalid source identifier: %r" % source) from err

    return run_moissonneur(moissonneur, project, name, query)
def get_moissonneur_name(ident):
    """Return moissonneur module name from RESOURCETYPE or crawler name.

    :param ident: either an object with a ``get`` method (its ``'crawler'``
        entry is then used), a crawler class name ending with ``Crawler``, or
        a string taken as being a moissonneur name already.
    :return: the lowercased crawler name stripped of its ``Crawler`` suffix,
        the string itself when it has no such suffix, or ``None`` when no
        string could be obtained from ``ident``.
    """
    # Does it quack like a RESOURCETYPE?
    if hasattr(ident, 'get'):
        ident = ident.get('crawler')

    if not isinstance(ident, str):
        return None

    # Extract name from crawler class name...
    suffix = 'Crawler'
    if ident.endswith(suffix):
        return ident[:-len(suffix)].lower()

    # ...otherwise assume ident is already a moissonneur name (this case used
    # to fall through and return None).
    return ident
def get_moissonneur(name):
    """Import and return the ``moissonneurs.<name>`` module.

    The imported module gets a ``name`` attribute set to ``name`` before being
    returned.

    :param name: module name; must be a string for which ``islower()`` holds.
    :raises NotebookError: when ``name`` is not such a string.
    :raises ImportError: when the module cannot be imported.
    """
    acceptable = isinstance(name, str) and name.islower()
    if not acceptable:
        raise NotebookError("Invalid moissonneur name: %r" % name)

    moissonneur = importlib.import_module('moissonneurs.%s' % name)
    moissonneur.name = name

    return moissonneur
def run_moissonneur(moissonneur, project, name, query):
    """Query ``moissonneur``, save the results and return the saved corpus.

    A bare stand-in object replaces the HTTP request the moissonneur's
    ``query`` and ``save`` functions are called with.

    :param moissonneur: module with ``name``, ``query`` and ``save``.
    :param project: object providing ``id`` and ``user_id``.
    :param name: name to give to the corpus.
    :param query: query string.
    :return: the corpus returned by ``moissonneur.save``, or ``None`` when the
        result count extracted from the query response is not above zero.
    """
    import json

    # XXX Kludge: attribute holder used to fake a request and its user.
    class Dummy(object):
        pass

    source_name = moissonneur.name

    fake_user = Dummy()
    fake_user.id = project.user_id
    fake_user.is_authenticated = lambda: True

    fake_request = Dummy()
    fake_request.method = 'POST'
    fake_request.path = 'nowhere'
    fake_request.META = {}
    # XXX According to the original note, 'string' only has an effect on
    #     moissonneurs.pubmed, where its value is normally added client-side
    #     (see templates/projects/project.html).
    fake_request.POST = {
        'string': name,
        'query': query,
        'N': QUERY_SIZE_N_MAX,
    }
    fake_request.user = fake_user

    if source_name == 'istex':
        # Every run of spaces becomes a single plus sign
        words = [word for word in query.split(' ') if word]
        fake_request.POST['query'] = '+'.join(words)

    # Errors raised below (e.g. ValueError, Http404) propagate to the caller.
    response = moissonneur.query(fake_request)
    payload = response.content.decode('utf-8')
    parsed = json.loads(payload)

    if source_name == 'pubmed':
        hits = sum(entry['count'] for entry in parsed)
        # For pubmed, the raw query response is used as query when saving
        fake_request.POST['query'] = payload
    elif source_name == 'istex':
        hits = parsed.get('total', 0)
    else:
        hits = parsed.get('results_nb', 0)

    if not hits > 0:
        return None

    corpus = moissonneur.save(fake_request, project.id, return_corpus=True)

    # The saved corpus may come back under another name: enforce ours.
    if corpus.name != name:
        corpus.name = name
        session.commit()

    return corpus
moissonneurs/cern.py
View file @
e7ac6426
...
...
@@ -30,7 +30,7 @@ def query( request):
#ids = crawlerbot.get_ids(query)
return
JsonHttpResponse
({
"results_nb"
:
crawlerbot
.
results_nb
})
def
save
(
request
,
project_id
):
def
save
(
request
,
project_id
,
return_corpus
=
False
):
'''save'''
if
request
.
method
==
"POST"
:
...
...
@@ -101,6 +101,9 @@ def save(request, project_id):
session
.
rollback
()
# --------------------------------------------
if
return_corpus
:
return
corpus
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
...
...
moissonneurs/hal.py
View file @
e7ac6426
...
...
@@ -33,7 +33,7 @@ def query( request):
print
(
results
)
return
JsonHttpResponse
({
"results_nb"
:
crawlerbot
.
results_nb
})
def
save
(
request
,
project_id
):
def
save
(
request
,
project_id
,
return_corpus
=
False
):
'''save'''
if
request
.
method
==
"POST"
:
...
...
@@ -103,6 +103,9 @@ def save(request, project_id):
session
.
rollback
()
# --------------------------------------------
if
return_corpus
:
return
corpus
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
...
...
moissonneurs/isidore.py
View file @
e7ac6426
...
...
@@ -29,7 +29,7 @@ def query( request):
#ids = crawlerbot.get_ids(query)
return
JsonHttpResponse
({
"results_nb"
:
crawlerbot
.
results_nb
})
def
save
(
request
,
project_id
):
def
save
(
request
,
project_id
,
return_corpus
=
False
):
'''save'''
if
request
.
method
==
"POST"
:
...
...
@@ -100,6 +100,9 @@ def save(request, project_id):
session
.
rollback
()
# --------------------------------------------
if
return_corpus
:
return
corpus
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
...
...
moissonneurs/istex.py
View file @
e7ac6426
...
...
@@ -52,7 +52,7 @@ def query( request ):
def
save
(
request
,
project_id
):
def
save
(
request
,
project_id
,
return_corpus
=
False
):
print
(
"testISTEX:"
)
print
(
request
.
method
)
alist
=
[
"bar"
,
"foo"
]
...
...
@@ -171,6 +171,9 @@ def save(request , project_id):
session
.
rollback
()
# --------------------------------------------
if
return_corpus
:
return
corpus
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
...
...
moissonneurs/multivac.py
View file @
e7ac6426
...
...
@@ -33,7 +33,7 @@ def query( request):
print
(
results
)
return
JsonHttpResponse
({
"results_nb"
:
crawlerbot
.
results_nb
})
def
save
(
request
,
project_id
):
def
save
(
request
,
project_id
,
return_corpus
=
False
):
'''save'''
if
request
.
method
==
"POST"
:
...
...
@@ -104,6 +104,9 @@ def save(request, project_id):
session
.
rollback
()
# --------------------------------------------
if
return_corpus
:
return
corpus
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
...
...
moissonneurs/pubmed.py
View file @
e7ac6426
...
...
@@ -69,7 +69,7 @@ def query( request ):
return
JsonHttpResponse
(
data
)
def
save
(
request
,
project_id
)
:
def
save
(
request
,
project_id
,
return_corpus
=
False
)
:
# implicit global session
# do we have a valid project id?
try
:
...
...
@@ -164,6 +164,10 @@ def save( request , project_id ) :
session
.
rollback
()
# --------------------------------------------
sleep
(
1
)
if
return_corpus
:
return
corpus
return
HttpResponseRedirect
(
'/projects/'
+
str
(
project_id
))
data
=
alist
...
...
notebooks/AdvancedTutorial.ipynb
View file @
e7ac6426
This diff is collapsed.
Click to expand it.
templates/pages/projects/project.html
View file @
e7ac6426
...
...
@@ -57,7 +57,7 @@
<center
id=
"corpus"
class=
"help"
>
<a
data-toggle=
"modal"
href=
"#addcorpus"
>
<button
type=
"button"
...
...
@@ -532,7 +532,7 @@
$
(
"#submit_thing"
).
html
(
"Process a {{ query_size }} sample!"
)
thequeries
=
data
var
N
=
0
,
k
=
0
;
var
N
=
0
;
for
(
var
i
in
thequeries
)
N
+=
thequeries
[
i
].
count
if
(
N
>
0
)
{
...
...
@@ -571,12 +571,11 @@
$
(
"#submit_thing"
).
html
(
"Process a {{ query_size }} sample!"
)
thequeries
=
data
var
N
=
data
.
length
,
k
=
0
;
// for(var i in thequeries) N += thequeries[i].count
if
(
N
>
1
)
{
var
total
=
JSON
.
parse
(
data
).
total
console
.
log
(
"N: "
+
total
)
$
(
"#theresults"
).
html
(
"<i> <b>"
+
pubmedquery
+
"</b>: "
+
total
+
" publications.</i><br>"
)
var
N
=
data
.
total
;
if
(
N
>
0
)
{
console
.
log
(
"N: "
+
N
)
$
(
"#theresults"
).
html
(
"<i> <b>"
+
pubmedquery
+
"</b>: "
+
N
+
" publications.</i><br>"
)
$
(
'#submit_thing'
).
prop
(
'disabled'
,
false
);
}
else
{
$
(
"#theresults"
).
html
(
"<i> <b>"
+
data
[
0
]
+
"</b></i><br>"
)
...
...
@@ -661,7 +660,7 @@
console
.
log
(
data
)
console
.
log
(
"SUCCESS"
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$
(
"#submit_thing"
).
prop
(
'disabled'
,
false
)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
...
...
@@ -721,7 +720,7 @@
console
.
log
(
data
)
console
.
log
(
"SUCCESS"
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$
(
"#submit_thing"
).
prop
(
'disabled'
,
false
)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
...
...
@@ -781,7 +780,7 @@
console
.
log
(
data
)
console
.
log
(
"SUCCESS"
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$
(
"#submit_thing"
).
prop
(
'disabled'
,
false
)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
...
...
@@ -876,12 +875,12 @@
console
.
log
(
"selected:"
,
selectedId
);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if
(
selectedId
==
"3"
||
selectedId
==
"8"
||
selectedId
==
"9"
||
selectedId
==
"10"
||
selectedId
==
"11"
||
selectedId
==
"12"
if
(
selectedId
==
"3"
||
selectedId
==
"8"
||
selectedId
==
"9"
||
selectedId
==
"10"
||
selectedId
==
"11"
||
selectedId
==
"12"
)
{
console
.
log
(
"show the button for: "
+
selectedId
)
$
(
"#div-fileornot"
).
css
(
"visibility"
,
"visible"
);
...
...
@@ -1019,16 +1018,16 @@
function
saveMultivac
(
query
,
N
){
console
.
log
(
"In Multivac"
)
if
(
!
query
||
query
==
""
)
return
;
console
.
log
(
query
)
//var origQuery = query
var
data
=
{
"query"
:
query
,
"N"
:
N
};
// Replace all the slashes
var
projectid
=
window
.
location
.
href
.
split
(
"projects"
)[
1
].
replace
(
/
\/
/g
,
''
)
console
.
log
(
data
)
$
.
ajax
({
dataType
:
'json'
,
...
...
@@ -1066,16 +1065,16 @@
function
save
(
query
,
N
,
urlGarg
){
console
.
log
(
"In Gargantext"
)
if
(
!
query
||
query
==
""
)
return
;
console
.
log
(
query
)
//var origQuery = query
var
data
=
{
"query"
:
query
,
"N"
:
N
};
// Replace all the slashes
var
projectid
=
window
.
location
.
href
.
split
(
"projects"
)[
1
].
replace
(
/
\/
/g
,
''
)
console
.
log
(
data
)
$
.
ajax
({
dataType
:
'json'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment