humanities / gargantext · Commits

Commit 0ab4f20a, authored Apr 14, 2017 by delanoe
[MERGE] Testing 2 stable.

Parents: 48c7e541, 499a52e7
Showing 32 changed files with 1269 additions and 143 deletions
Changed files:

  CHANGELOG.md                                +6    -0
  gargantext/constants.py                     +25   -10
  gargantext/urls.py                          +8    -7
  gargantext/util/crawlers/CERN.py            +3    -1
  gargantext/util/crawlers/HAL.py             +135  -0
  gargantext/util/crawlers/ISTEX.py           +8    -0
  gargantext/util/crawlers/MULTIVAC.py        +118  -0
  gargantext/util/crawlers/_Crawler.py        +17   -13
  gargantext/util/db.py                       +3    -0
  gargantext/util/generators/credits.py       +30   -24
  gargantext/util/parsers/HAL.py              +75   -0
  gargantext/util/parsers/ISTEX.py            +10   -9
  gargantext/util/parsers/MULTIVAC.py         +82   -0
  gargantext/util/parsers/PUBMED.py           +1    -1
  gargantext/util/toolchain/metric_tfidf.py   +7    -5
  gargantext/util/toolchain/ngram_coocs.py    +14   -10
  gargantext/views/pages/main.py              +2    -1
  graph/graph.py                              +18   -9
  graph/growth.py                             +61   -0
  graph/utils.py                              +2    -0
  install/gargamelle/Debian.sh                +10   -13
  install/gargamelle/psqlFunctions.sql        +71   -0
  install/gargamelle/psql_configure.sh        +5    -5
  moissonneurs/hal.py                         +119  -0
  moissonneurs/multivac.py                    +120  -0
  moissonneurs/urls.py                        +24   -17
  templates/pages/main/about.html             +46   -0
  templates/pages/menu.html                   +1    -1
  templates/pages/projects/modals.tpl         +3    -3
  templates/pages/projects/moissonneurs.js    +5    -3
  templates/pages/projects/project.html       +237  -8
  templates/pages/projects/wait.html          +3    -3
CHANGELOG.md

@@ -2,6 +2,12 @@
 * Guided Tour
 * Sources form highlighting crawlers

+## Version 3.0.6.8
+* REPEC Crawler (connection with https://multivac.iscpif.fr)
+* HAL Crawler (connection to https://hal.archives-ouvertes.fr/)
+* New Graph Feature: color nodes by growth
+
 ## Version 3.0.6.4
 * COOC SQL improved
...
gargantext/constants.py

@@ -181,8 +181,6 @@ def get_tagger(lang):
     return tagger()

 RESOURCETYPES = [
     {   "type": 1,
         'name': 'Europresse',
...
@@ -199,7 +197,7 @@ RESOURCETYPES = [
         'crawler': None,
     },
     {   'type': 3,
-        'name': 'Pubmed [XML]',
+        'name': 'Pubmed [CRAWLER/XML]',
         'format': 'Pubmed',
         'parser': "PubmedParser",
         'file_formats': ["zip", "xml"],
...
@@ -235,26 +233,43 @@ RESOURCETYPES = [
         'crawler': None,
     },
     {   'type': 8,
-        'name': 'ISTex',
+        'name': 'ISTex [CRAWLER]',
         'format': 'json',
         'parser': "ISTexParser",
         'file_formats': ["zip", "txt"],
         'crawler': None,
     },
     {   "type": 9,
-        "name": 'SCOAP [XML]',
+        "name": 'SCOAP [CRAWLER/XML]',
         "parser": "CernParser",
         "format": 'MARC21',
         'file_formats': ["zip", "xml"],
         "crawler": "CernCrawler",
     },
+    # { "type": 10,
+    #   "name": 'REPEC [RIS]',
+    #   "parser": "RISParser",
+    #   "format": 'RIS',
+    #   'file_formats': ["zip", "ris", "txt"],
+    #   "crawler": None,
+    # },
     {   "type": 10,
-        "name": 'REPEC [RIS]',
-        "parser": "RISParser",
-        "format": 'RIS',
-        'file_formats': ["zip", "ris", "txt"],
-        "crawler": None,
+        "name": 'REPEC [CRAWLER]',
+        "parser": "MultivacParser",
+        "format": 'JSON',
+        'file_formats': ["zip", "json"],
+        "crawler": "MultivacCrawler",
     },
+    {   "type": 11,
+        "name": 'HAL [CRAWLER]',
+        "parser": "HalParser",
+        "format": 'JSON',
+        'file_formats': ["zip", "json"],
+        "crawler": "HalCrawler",
+    },
 ]

 #shortcut for resources declaration in template
 PARSERS = [(n["type"], n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
...
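These entries drive the rest of the commit: the moissonneurs views below look a source up by its "type" and instantiate the class named under "crawler". A minimal usage sketch of that pattern, mirroring the calls made in moissonneurs/hal.py (the query string is illustrative):

    # Illustrative lookup of a RESOURCETYPES entry, as in moissonneurs/hal.py.
    from gargantext.constants import get_resource, load_crawler

    source  = get_resource(11)                 # the 'HAL [CRAWLER]' entry above
    crawler = load_crawler(source)()           # -> a HalCrawler instance
    print(crawler.scan_results("phylogeny"))   # number of matching HAL records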
gargantext/urls.py

@@ -28,19 +28,20 @@ import graph.urls
 import moissonneurs.urls

 urlpatterns = [ url(r'^admin/'      , admin.site.urls)
               , url(r'^api/'        , include( gargantext.views.api.urls   ) )
               , url(r'^'            , include( gargantext.views.pages.urls ) )
               , url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
                                                       , permanent=False), name="favicon")

               # Module Graph
               , url(r'^', include( graph.urls ) )

               # Module Annotation
               # tempo: unchanged doc-annotations routes --
               , url(r'^annotations/', include( annotations_urls ) )
               , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/(focus=[0-9,]+)?$'
                    , annotations_main_view)

               # Module Scrapers (Moissonneurs in French)
               , url(r'^moissonneurs/', include( moissonneurs.urls ) )
               ]
...
gargantext/util/crawlers/CERN.py

@@ -4,7 +4,7 @@
 # ***** CERN Scrapper *****
 # ****************************
 # Author: c24b
-# Date: 27/05/2015
+# Date: 27/05/2016

 import hmac, hashlib
 import requests
 import os
...
@@ -96,10 +96,12 @@ class CernCrawler(Crawler):
         print(self.results_nb, "res")
         #self.generate_urls()
         return(self.ids)

     def generate_urls(self):
         ''' generate raw urls of ONE record'''
         self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" % rid for rid in self.ids]
         return self.urls

     def fetch_records(self, ids):
         ''' for NEXT time'''
         raise NotImplementedError
...
gargantext/util/crawlers/HAL.py (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****    HAL Scrapper    ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Crawler import *
import json

from gargantext.constants  import UPLOAD_DIRECTORY
from math                  import trunc
from gargantext.util.files import save


class HalCrawler(Crawler):
    ''' HAL API CLIENT'''

    def __init__(self):
        # Main EndPoints
        self.BASE_URL = "https://api.archives-ouvertes.fr"
        self.API_URL  = "search"

        # Final EndPoints
        # TODO : Change endpoint according type of database
        self.URL    = self.BASE_URL + "/" + self.API_URL
        self.status = []

    def __format_query__(self, query=None):
        '''formatting the query'''
        #search_field = "title_t"
        search_field = "abstract_t"
        return (search_field + ":" + "(" + query + ")")

    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
        fl = """ title_s
               , abstract_s
               , submittedDate_s
               , journalDate_s
               , authFullName_s
               , uri_s
               , isbn_s
               , issue_s
               , journalPublisher_s
             """
        #, authUrl_s
        #, type_s
        wt = "json"

        querystring = { "q"     : query
                      , "rows"  : count
                      , "start" : fromPage
                      , "fl"    : fl
                      , "wt"    : wt
                      }

        # Specify Headers
        headers = {"cache-control": "no-cache"}

        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers=headers
                                   , params=querystring
                                   )
        #print(querystring)
        # Validation : 200 if ok else raise Value
        if response.status_code == 200:
            charset = ( response.headers["Content-Type"]
                                .split("; ")[1]
                                .split("=")[1]
                      )
            return (json.loads(response.content.decode(charset)))
        else:
            raise ValueError(response.status_code, response.reason)

    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = 0
        total = ( self._get(query)
                      .get("response", {})
                      .get("numFound", 0)
                )
        self.results_nb = total
        return self.results_nb

    def download(self, query):
        downloaded = False
        self.status.append("fetching results")
        corpus = []
        paging = 100
        self.query_max = self.scan_results(query)
        #print("self.query_max : %s" % self.query_max)

        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("ERROR (scrap: Multivac d/l ): ", msg)
            self.query_max = QUERY_SIZE_N_MAX

        #for page in range(1, trunc(self.query_max / 100) + 2):
        for page in range(0, self.query_max, paging):
            print("Downloading page %s to %s results" % (page, paging))
            docs = ( self._get(query, fromPage=page, count=paging)
                         .get("response", {})
                         .get("docs", [])
                   )
            for doc in docs:
                corpus.append(doc)

        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name    = 'HAL.json'
                        , basedir = UPLOAD_DIRECTORY
                        )
        downloaded = True
        return downloaded
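For reference, the Solr-style request that _get builds can be reproduced standalone; a sketch with an illustrative query and a shortened field list:

    # Standalone reproduction of the request HalCrawler._get issues.
    import requests

    response = requests.get( "https://api.archives-ouvertes.fr/search"
                           , params  = { "q"     : "abstract_t:(phylogeny)"  # __format_query__ output
                                       , "rows"  : 10
                                       , "start" : 0
                                       , "fl"    : "title_s,uri_s"           # shortened field list
                                       , "wt"    : "json" }
                           , headers = {"cache-control": "no-cache"} )
    print(response.json().get("response", {}).get("numFound", 0))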
gargantext/util/crawlers/ISTEX.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****  MULTIVAC Scrapper  ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Crawler import *
import json
...
gargantext/util/crawlers/MULTIVAC.py (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****  MULTIVAC Scrapper  ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Crawler import *
import json

from gargantext.settings   import API_TOKENS
from gargantext.constants  import UPLOAD_DIRECTORY
from math                  import trunc
from gargantext.util.files import save


class MultivacCrawler(Crawler):
    ''' Multivac API CLIENT'''

    def __init__(self):
        self.apikey = API_TOKENS["MULTIVAC"]

        # Main EndPoints
        self.BASE_URL = "https://api.iscpif.fr/v2"
        self.API_URL  = "pvt/economy/repec/search"

        # Final EndPoints
        # TODO : Change endpoint according type of database
        self.URL    = self.BASE_URL + "/" + self.API_URL
        self.status = []

    def __format_query__(self, query=None):
        '''formatting the query'''
        None

    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
        querystring = { "q"       : query
                      , "count"   : count
                      , "from"    : fromPage
                      , "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
                      }
        if lang is not None:
            querystring["lang"] = lang

        # Specify Headers
        headers = {"cache-control": "no-cache"}

        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers=headers
                                   , params=querystring
                                   )
        #print(querystring)
        # Validation : 200 if ok else raise Value
        if response.status_code == 200:
            charset = ( response.headers["Content-Type"]
                                .split("; ")[1]
                                .split("=")[1]
                      )
            return (json.loads(response.content.decode(charset)))
        else:
            raise ValueError(response.status_code, response.reason)

    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = 0
        total = ( self._get(query)
                      .get("results", {})
                      .get("total", 0)
                )
        self.results_nb = total
        return self.results_nb

    def download(self, query):
        downloaded = False
        self.status.append("fetching results")
        corpus = []
        paging = 100
        self.query_max = self.scan_results(query)
        #print("self.query_max : %s" % self.query_max)

        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("ERROR (scrap: Multivac d/l ): ", msg)
            self.query_max = QUERY_SIZE_N_MAX

        for page in range(1, trunc(self.query_max / 100) + 2):
            print("Downloading page %s to %s results" % (page, paging))
            docs = ( self._get(query, fromPage=page, count=paging)
                         .get("results", {})
                         .get("hits", [])
                   )
            for doc in docs:
                corpus.append(doc)

        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name    = 'Multivac.json'
                        , basedir = UPLOAD_DIRECTORY
                        )
        downloaded = True
        return downloaded
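Unlike HalCrawler, which steps through offsets, this download loop iterates 1-based page indices; the arithmetic for an illustrative 250-result query:

    # Paging arithmetic used by MultivacCrawler.download (fabricated totals).
    from math import trunc

    query_max, paging = 250, 100
    pages = list(range(1, trunc(query_max / 100) + 2))
    print(pages)   # [1, 2, 3] -> three requests of up to 100 hits each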
gargantext/util/crawlers/_Crawler.py

-# Scrapers config
-QUERY_SIZE_N_MAX = 1000
-
-from gargantext.constants       import get_resource
+from gargantext.constants       import get_resource, QUERY_SIZE_N_MAX
 from gargantext.util.scheduling import scheduled
 from gargantext.util.db         import session
 from requests_futures.sessions  import FuturesSession
...
@@ -18,31 +18,34 @@ class Crawler:
         #the name of corpus
         #that will be built in case of internal fileparsing
         self.record     = record
         self.name       = record["corpus_name"]
         self.project_id = record["project_id"]
         self.user_id    = record["user_id"]
         self.resource   = record["source"]
         self.type       = get_resource(self.resource)
         self.query      = record["query"]

         #format the sampling
         self.n_last_years = 5
         self.YEAR  = date.today().year
         #not pretty, but the easy version
         self.MONTH = str(date.today().month)
         if len(self.MONTH) == 1:
             self.MONTH = "0" + self.MONTH
-        self.MAX_RESULTS = 1000
+        self.MAX_RESULTS = QUERY_SIZE_N_MAX

         try:
             self.results_nb = int(record["count"])
         except KeyError:
             #does not exist yet
             self.results_nb = 0
         try:
             self.webEnv   = record["webEnv"]
             self.queryKey = record["queryKey"]
             self.retMax   = record["retMax"]
         except KeyError:
             #does not exist yet
             self.queryKey = None
...
@@ -67,6 +70,7 @@ class Crawler:
         if self.download():
             self.create_corpus()
             return self.corpus_id

     def get_sampling_dates():
         '''Create a sample list of min and max date based on Y and M for N_LAST_YEARS results'''
...
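The base class is a template: subclasses such as HalCrawler and MultivacCrawler supply scan_results and download, and the method shown in the last hunk chains download() into create_corpus(). A skeletal subclass, assuming only the contract visible above (all body values are placeholders):

    # Skeletal Crawler subclass; endpoint and values are placeholders.
    class DemoCrawler(Crawler):

        def scan_results(self, query):
            self.results_nb = 42            # would ask the remote API here
            return self.results_nb

        def download(self, query):
            self.path = "/tmp/demo.json"    # would fetch and save records here
            return True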
gargantext/util/db.py

@@ -171,3 +171,6 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stat
     cursor.execute('COMMIT WORK;')
     cursor.close()
gargantext/util/generators/credits.py

@@ -8,29 +8,12 @@ import random

 _members = [
-    { 'first_name' : 'Constance', 'last_name' : 'de Quatrebarbes',
-      'mail' : '4barbesATgmail.com',
-      'website' : 'http://c24b.github.io/',
-      'picture' : 'constance.jpg', 'role' : 'developer'},
     { 'first_name' : 'David', 'last_name' : 'Chavalarias',
       'mail' : 'david.chavalariasATiscpif.fr',
       'website' : 'http://chavalarias.com',
       'picture' : 'david.jpg', 'role' : 'principal investigator'},
-    # { 'first_name' : 'Elias', 'last_name' : 'Showk',
-    #   'mail' : '',
-    #   'website' : 'https://github.com/elishowk',
-    #   'picture' : '', 'role' : 'developer'},
-    { 'first_name' : 'Mathieu', 'last_name' : 'Rodic',
-      'mail' : '',
-      'website' : 'http://rodic.fr',
-      'picture' : 'mathieu.jpg', 'role' : 'developer'},
     { 'first_name' : 'Samuel', 'last_name' : 'Castillo J.',
       'mail' : 'kaisleanATgmail.com',
       'website' : 'http://www.pksm3.droppages.com',
...
@@ -43,12 +26,6 @@ _members = [
       'picture' : 'maziyar.jpg', 'role' : 'developer'},
-    { 'first_name' : 'Romain', 'last_name' : 'Loth',
-      'mail' : '',
-      'website' : 'http://iscpif.fr',
-      'picture' : 'romain.jpg', 'role' : 'developer'},
     { 'first_name' : 'Alexandre', 'last_name' : 'Delanoë',
       'mail' : 'alexandre+gargantextATdelanoe.org',
       'website' : 'http://alexandre.delanoe.org',
...
@@ -59,8 +36,33 @@ _members = [
     # copy-paste the line above and write your informations please
 ]

+_membersPast = [
+    { 'first_name' : 'Constance', 'last_name' : 'de Quatrebarbes',
+      'mail' : '4barbesATgmail.com',
+      'website' : 'http://c24b.github.io/',
+      'picture' : 'constance.jpg', 'role' : 'developer'},
+    { 'first_name' : 'Mathieu', 'last_name' : 'Rodic',
+      'mail' : '',
+      'website' : 'http://rodic.fr',
+      'picture' : 'mathieu.jpg', 'role' : 'developer'},
+    { 'first_name' : 'Romain', 'last_name' : 'Loth',
+      'mail' : '',
+      'website' : 'http://iscpif.fr',
+      'picture' : 'romain.jpg', 'role' : 'developer'},
+    { 'first_name' : 'Elias', 'last_name' : 'Showk',
+      'mail' : '',
+      'website' : 'https://github.com/elishowk',
+      'picture' : '', 'role' : 'developer'},
+]

 _institutions = [
-    #{ 'name' : 'Mines ParisTech', 'website' : 'http://mines-paristech.fr', 'picture' : 'mines.png', 'funds':''},
+    { 'name' : 'Mines ParisTech', 'website' : 'http://mines-paristech.fr', 'picture' : 'mines.png', 'funds':''},
     #{ 'name' : 'Institut Pasteur', 'website' : 'http://www.pasteur.fr', 'picture' : 'pasteur.png', 'funds':''},
     { 'name' : 'EHESS', 'website' : 'http://www.ehess.fr', 'picture' : 'ehess.png', 'funds':''},
     #{ 'name' : '', 'website' : '', 'picture' : '', 'funds':''},
...
@@ -87,6 +89,10 @@ def members():
     random.shuffle(_members)
     return _members

+def membersPast():
+    random.shuffle(_membersPast)
+    return _membersPast
+
 def institutions():
     random.shuffle(_institutions)
     return _institutions
...
gargantext/util/parsers/HAL.py (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****    HAL Parser     ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Parser import Parser
from datetime import datetime
import json


class HalParser(Parser):

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()

        json_docs = data
        hyperdata_list = []

        hyperdata_path = { "id"       : "isbn_s"
                         , "title"    : "title_s"
                         , "abstract" : "abstract_s"
                         , "source"   : "journalPublisher_s"
                         , "url"      : "uri_s"
                         , "authors"  : "authFullName_s"
                         }

        uris = set()

        for doc in json_docs:
            hyperdata = {}

            for key, path in hyperdata_path.items():
                field = doc.get(path, "NOT FOUND")
                if isinstance(field, list):
                    hyperdata[key] = ", ".join(field)
                else:
                    hyperdata[key] = field

            if hyperdata["url"] in uris:
                print("Document already parsed")
            else:
                uris.add(hyperdata["url"])

            # hyperdata["authors"] = ", ".join(
            #                          [ p.get("person", {})
            #                             .get("name"  , "")
            #
            #                            for p in doc.get("hasauthor", [])
            #                          ]
            #                        )
            #
            maybeDate = doc.get("submittedDate_s", None)
            if maybeDate is not None:
                date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
            else:
                date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
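To make the mapping concrete, a runnable miniature of the same loop over one fabricated HAL record (all field values are invented):

    # Fabricated HAL record through the same field mapping as HalParser.parse.
    from datetime import datetime

    hyperdata_path = {"title": "title_s", "url": "uri_s", "authors": "authFullName_s"}
    doc = { "title_s"         : ["A study of X"]
          , "uri_s"           : "https://hal.archives-ouvertes.fr/hal-000001"
          , "authFullName_s"  : ["A. Author", "B. Author"]
          , "submittedDate_s" : "2016-05-27 10:00:00"
          }

    hyperdata = {}
    for key, path in hyperdata_path.items():
        field = doc.get(path, "NOT FOUND")
        hyperdata[key] = ", ".join(field) if isinstance(field, list) else field

    date = datetime.strptime(doc["submittedDate_s"], "%Y-%m-%d %H:%M:%S")
    hyperdata["publication_year"] = str(date.year)
    print(hyperdata)   # lists joined with ", ", the year split out as a string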
gargantext/util/parsers/ISTEX.py

@@ -13,20 +13,21 @@ class ISTexParser(Parser):
         hyperdata_list = []
         hyperdata_path = { "id"              : "id"
-                         , "source"          : 'corpusName'
-                         , "title"           : 'title'
-                         , "language_iso3"   : 'language'
-                         , "doi"             : 'doi'
-                         , "host"            : 'host'
-                         , "publication_date": 'publicationDate'
-                         , "abstract"        : 'abstract'
+                         , "source"          : "corpusName"
+                         , "title"           : "title"
+                         , "genre"           : "genre"
+                         , "language_iso3"   : "language"
+                         , "doi"             : "doi"
+                         , "host"            : "host"
+                         , "publication_date": "publicationDate"
+                         , "abstract"        : "abstract"
                          # "authors"         : 'author'
-                         , "authorsRAW"      : 'author'
+                         , "authorsRAW"      : "author"
                          #"keywords"         : "keywords"
                          }
         suma = 0

         for json_doc in json_docs:
             hyperdata = {}
...
@@ -103,7 +104,7 @@ class ISTexParser(Parser):
             RealDate = RealDate[0]
             # print( RealDate ," | length:", len(RealDate))
-            Decision = ""
+            Decision = True
             if len(RealDate) > 4:
                 if len(RealDate) > 8:
                     try:
                         Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
...
gargantext/util/parsers/MULTIVAC.py (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****  MULTIVAC Parser   ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Parser import Parser
from datetime import datetime
import json


class MultivacParser(Parser):

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()

        json_docs = data
        hyperdata_list = []

        hyperdata_path = { "id"       : "id"
                         , "title"    : "title"
                         , "abstract" : "abstract"
                         , "type"     : "type"
                         }

        for json_doc in json_docs:
            hyperdata = {}
            doc = json_doc["_source"]

            for key, path in hyperdata_path.items():
                hyperdata[key] = doc.get(path, "")

            hyperdata["source"] = doc.get("serial", {}) \
                                     .get("journaltitle", "REPEC Database")

            try:
                hyperdata["url"] = doc.get("file", {}) \
                                      .get("url", "")
            except:
                pass

            hyperdata["authors"] = ", ".join(
                                     [ p.get("person", {})
                                        .get("name", "")
                                       for p in doc.get("hasauthor", [])
                                     ]
                                   )

            year = doc.get("serial", {}) \
                      .get("issuedate", None)
            if year == "Invalide date":
                year = doc.get("issuedate", None)

            if year is None:
                year = datetime.now()
            else:
                try:
                    date = datetime.strptime(year, '%Y')
                except:
                    print("FIX DATE MULTIVAC REPEC %s" % year)
                    date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
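The date cascade in miniature, with fabricated inputs (the sketch normalizes the None branch, which the committed code handles slightly differently):

    # Fabricated inputs through the same date cascade as MultivacParser.parse.
    from datetime import datetime

    for year in ("2015", "Invalide date", "15th century"):
        if year == "Invalide date":
            year = None                 # pretend the doc-level retry also failed
        if year is None:
            date = datetime.now()
        else:
            try:
                date = datetime.strptime(year, '%Y')
            except ValueError:
                print("FIX DATE MULTIVAC REPEC %s" % year)
                date = datetime.now()
        print(year, "->", date.year)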
gargantext/util/parsers/PUBMED.py

@@ -78,7 +78,7 @@ class PubmedParser(Parser):
                 if "publication_month" in hyperdata: PubmedDate += " " + hyperdata["publication_month"]
                 if "publication_day"   in hyperdata: PubmedDate += " " + hyperdata["publication_day"]

-                Decision = ""
+                Decision = True
                 if len(RealDate) > 4:
                     if len(RealDate) > 8:
                         try:
                             Decision = datetime.strptime(RealDate, '%Y %b %d').date()
...
gargantext/util/toolchain/metric_tfidf.py

@@ -109,7 +109,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
                   .group_by("counted_form")
              )

     #print(str(occs_q))
     #print(str(occs_q.all()))
     occ_sums = occs_q.all()
     # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
     #                     ^^^^  ^^^
...
@@ -177,6 +177,7 @@ def compute_ti_ranking(corpus,
         - overwrite_id: optional id of a pre-existing XXXX node for this corpus
           (the Node and its previous Node NodeNgram rows will be replaced)
     """
+    print("compute_ti_ranking")

     # validate string params
     if count_scope not in ["local", "global"]:
         raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
...
@@ -189,7 +190,7 @@ def compute_ti_ranking(corpus,
     if type(corpus) == int:
         corpus_id = corpus
         corpus = cache.Node[corpus_id]
-    elif type(corpus) == str and match(r'\d+$', corpus):
+    elif type(corpus) == str and match(r'^\d+$', corpus):
         corpus_id = int(corpus)
         corpus = cache.Node[corpus_id]
     else:
...
@@ -329,7 +330,7 @@ def compute_ti_ranking(corpus,
     # result
     print("%s : Starting Query tf_nd_query" % t())
     print(str(tf_nd_query))
     #print(str(tf_nd_query.all()))
     tf_nd = tf_nd_query.all()
     print("%s : End Query tf_nd_quer" % t())
...
@@ -371,7 +372,7 @@ def compute_ti_ranking(corpus,
     # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
     # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
     # TODO 4 requalify this here as a NodeNgram
-    # then TODO 5 use WeightedList.save() !
+    # TODO 5 use WeightedList.save()
     # reflect that in NodeNodeNgrams
     bulk_insert(
...
@@ -398,7 +399,8 @@ def compute_tfidf_local(corpus,
         - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
           (the Node and its previous NodeNodeNgram rows will be replaced)
     """
+    print("Compute TFIDF local")

     # All docs of this corpus
     docids_subquery = (session.query(Node.id)
...
gargantext/util/toolchain/ngram_coocs.py

@@ -3,9 +3,9 @@ COOCS
 (this is the full SQL version, should be more reliable on outerjoin)
 """
 from gargantext                import settings
-from sqlalchemy                import create_engine
+from sqlalchemy                import exc
 from gargantext.util.lists     import WeightedMatrix
-# from gargantext.util.db      import session, aliased, func
+from gargantext.util.db        import get_engine
 from gargantext.util.db_cache  import cache
 from gargantext.constants      import DEFAULT_COOC_THRESHOLD, NODETYPES
 from gargantext.constants      import INDEXED_HYPERDATA
...
@@ -64,12 +64,7 @@ def compute_coocs( corpus,
     """
     # 1) prepare direct connection to the DB
-    url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
-            **settings.DATABASES['default']
-    )
-    engine = create_engine(url)
-    connection = engine.connect()
+    connection = get_engine().connect()

     # string vars for our SQL query
     # setting work memory high to improve cache perf.
...
@@ -223,10 +218,19 @@ def compute_coocs( corpus,
     # 6) EXECUTE QUERY
     # ----------------
     # debug
-    print(final_sql)
+    # print(final_sql)

     # executing the SQL statement
-    results = connection.execute(final_sql)
+    try:
+        # suppose the database has been restarted.
+        results = connection.execute(final_sql)
+        connection.close()
+    except exc.DBAPIError as e:
+        # an exception is raised, Connection is invalidated.
+        if e.connection_invalidated:
+            print("Connection was invalidated for ngram_coocs")
+        else:
+            print(e)

     # => storage in our matrix structure
     matrix = WeightedMatrix(results)
...
gargantext/views/pages/main.py

@@ -47,7 +47,8 @@ def about(request):
     context = {
         'user'        : request.user,
         'date'        : datetime.datetime.now(),
         'team'        : credits.members(),
+        'teamPast'    : credits.membersPast(),
         'institutions': credits.institutions(),
         'labos'       : credits.labs(),
         'grants'      : credits.grants(),
...
graph/graph.py

@@ -8,6 +8,7 @@ from graph.cooccurrences import countCooccurrences
 from graph.distances         import clusterByDistances
 from graph.bridgeness        import filterByBridgeness
 from graph.mail_notification import notify_owner
+from graph.growth            import compute_growth

 from gargantext.util.scheduling import scheduled
 from gargantext.constants       import graph_constraints
...
@@ -64,7 +65,15 @@ def compute_graph( corpus_id=None , cooc_id=None
     print("GRAPH #%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
     data = filterByBridgeness(G, partition, ids, weight, bridgeness, "node_link", field1, field2)

+    if start is not None and end is not None:
+        growth = dict()
+        for (ng_id, score) in compute_growth(corpus_id, groupList_id, mapList_id, start, end):
+            growth[ng_id] = float(score) + 100  # for the normalization, should not be negative
+
+        for node in data['nodes']:
+            node['attributes']['growth'] = growth[node['id']]
+
     print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
     node = session.query(Node).filter(Node.id == cooc_id).first()
...
@@ -187,7 +196,7 @@ def get_graph( request=None , corpus=None
                          )
                          .filter( Start.key == 'publication_date')
                          .filter( Start.value_utc >= date_start_utc)
               )

     # Filter corpus by date if any end date
...
@@ -203,8 +212,7 @@ def get_graph( request=None , corpus=None
                          )
                          .filter( End.key == 'publication_date')
                          .filter( End.value_utc <= date_end_utc)
               )

     # Finally test if the size of the corpora is big enough
     # --------------------------------
...
@@ -221,10 +229,11 @@ def get_graph( request=None , corpus=None
                              #, limit=size
                              )
         return { "state"       : "saveOnly"
                , "target_id"   : cooc_id
                , "target_name" : cooc_name
                , "target_date" : cooc_date
                }

     elif corpus_size > graph_constraints['corpusMax']:
         # Then compute cooc asynchronously with celery
...
@@ -262,5 +271,5 @@ def get_graph( request=None , corpus=None
     if len(data) == 0:
         print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
         data = {'nodes':[], 'links':[]}  # empty data

     return data
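The +100 shift is easy to sanity-check: OCC_HIST (defined in install/gargamelle/psqlFunctions.sql below) returns a symmetric percentage in roughly [-100, 100], so shifting keeps every node attribute non-negative. With a fabricated score:

    # Fabricated growth score through the normalization used in compute_graph.
    score = -37.5                  # an ngram shrinking between the two periods
    print(float(score) + 100)      # 62.5 -> stored as node['attributes']['growth']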
graph/growth.py (new file, mode 100644)

"""
Computes ngram growth on periods
"""

from gargantext.models        import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db_cache import cache
from gargantext.util.db       import session, bulk_insert, aliased, \
                                     func, get_engine # = sqlalchemy.func like sum() or count()
from datetime import datetime


def timeframes(start, end):
    """
    timeframes :: String -> String -> (UTCTime, UTCTime, UTCTime)
    """
    start = datetime.strptime(str(start), "%Y-%m-%d")
    end   = datetime.strptime(str(end),   "%Y-%m-%d")
    date_0 = start - (end - start)
    date_1 = start
    date_2 = end
    return (date_0, date_1, date_2)


def compute_growth(corpus_id, groupList_id, mapList_id, start, end):
    """
    compute_graph :: Int -> UTCTime -> UTCTime -> Int -> Int
                  -> [(Int, Numeric)]

    this function uses SQL function in
    /srv/gargantext/install/gargamelle/sqlFunctions.sql

    First compute occurrences of ngrams in mapList (with groups) on the first
    period, then on the second and finally returns growth.

    Directly computed with Postgres Database (C) for optimization.
    """
    connection = get_engine()

    (date_0, date_1, date_2) = timeframes(start, end)

    query = """SELECT * FROM OCC_HIST( {corpus_id}
                                     , {groupList_id}
                                     , {mapList_id}
                                     , '{date_0}'
                                     , '{date_1}'
                                     , '{date_2}'
                                     )
            """.format( corpus_id    = corpus_id
                      , groupList_id = groupList_id
                      , mapList_id   = mapList_id
                      , date_0       = date_0
                      , date_1       = date_1
                      , date_2       = date_2
                      )

    return(connection.execute(query))
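A minimal usage sketch, mirroring how graph/graph.py consumes this function (the ids reuse the SQL file's example, the dates are invented):

    # Placeholder arguments; graph/graph.py drives compute_growth exactly like this.
    from graph.growth import compute_growth

    growth = dict()
    for (ng_id, score) in compute_growth( corpus_id    = 182856
                                        , groupList_id = 183859
                                        , mapList_id   = 183866
                                        , start = "2015-01-01", end = "2017-01-01" ):
        growth[ng_id] = float(score) + 100   # same shift as graph/graph.py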
graph/utils.py

@@ -19,6 +19,8 @@ def compress_graph(graphdata):
     for node in graphdata['nodes']:
         node['lb'] = node['label']
         del node['label']

+        #node['attributes']['growth'] = 0.8

         node['at'] = node['attributes']
         del node['attributes']
...
install/gargamelle/Debian.sh

@@ -5,13 +5,10 @@ apt-get install -y \
     apt-utils ca-certificates locales \
     sudo aptitude gcc g++ wget git vim \
     build-essential make \
-    postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5 \
-    postgresql-server-dev-9.5 libpq-dev libxml2 \
-    postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5 \
-    nginx rabbitmq-server
-
-# WARNING: uwsgi is not on stretch any more (get it from unstable)
-# uwsgi uwsgi-core uwsgi-plugin-python3
+    postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
+    postgresql-server-dev-9.6 libpq-dev libxml2 \
+    postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
+    nginx rabbitmq-server uwsgi uwsgi-core uwsgi-plugin-python3

 ### Configure timezone and locale
...
@@ -32,15 +29,15 @@ update-locale LC_ALL=fr_FR.UTF-8
 ### Install main dependencies and python packages based on Debian distrib
 echo "############# PYTHON DEPENDENCIES ###############"
 apt-get update && apt-get install -y \
-    libxml2-dev xml-core libgfortran-5-dev \
+    libxml2-dev xml-core libgfortran-6-dev \
     libpq-dev \
     python3.5 \
     python3-dev \
     python3-six python3-numpy python3-setuptools \
    python3-numexpr \
     python3-pip \
-    libxml2-dev libxslt-dev #libxslt1-dev zlib1g-dev
+    libxml2-dev libxslt-dev zlib1g-dev #libxslt1-dev

 ### UPDATE AND CLEAN
 apt-get update && apt-get autoclean
...
@@ -70,7 +67,7 @@ update-locale LC_ALL=fr_FR.UTF-8
 ## POSTGRESQL DATA (as ROOT)
 #######################################################################
-sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.5/main/postgresql.conf
-echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.5/main/pg_hba.conf
-echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
+sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.6/main/postgresql.conf
+echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.6/main/pg_hba.conf
+echo "listen_addresses='*'" >> /etc/postgresql/9.6/main/postgresql.conf
install/gargamelle/psqlFunctions.sql (new file, mode 100644)

-- CNRS Copyrights 2017
-- See Gargantext Licence for details
-- Maintainers: team@gargantext.org

-- USAGE
-- psql gargandb < occ_growth.sql

-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
-- EXAMPLE USAGE
-- SELECT * FROM OCC_HIST(182856, 183859, 183866, '1800-03-15 17:00:00+01', '2000-03-15 17:00:00+01', '2017-03-15 17:00:00+01')

-- OCC_HIST_PART :: Corpus.id -> GroupList.id -> Start -> End
DROP FUNCTION OCC_HIST_PART(integer, integer, timestamp without time zone, timestamp without time zone);
-- DROP for tests

CREATE OR REPLACE FUNCTION OCC_HIST_PART(int, int, timestamp, timestamp)
RETURNS TABLE (ng_id int, score float8)
AS $$
-- EXPLAIN ANALYZE
    SELECT COALESCE(gr.ngram1_id, ng1.ngram_id) as ng_id, SUM(ng1.weight) as score
    from nodes n
    -- BEFORE
    INNER JOIN nodes as n1 ON n1.id = n.id
    INNER JOIN nodes_ngrams ng1 ON ng1.node_id = n1.id
    -- Limit with timestamps: ]start, end]
    INNER JOIN nodes_hyperdata nh1 ON nh1.node_id = n1.id
        AND nh1.value_utc >  $3
        AND nh1.value_utc <= $4
    -- Group List
    LEFT JOIN nodes_ngrams_ngrams gr ON ng1.ngram_id = gr.ngram2_id
        AND gr.node_id = $2
    WHERE n.typename = 4
      AND n.parent_id = $1
    GROUP BY 1
$$
LANGUAGE SQL;


DROP FUNCTION OCC_HIST(integer, integer, integer, timestamp without time zone, timestamp without time zone, timestamp without time zone);

-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
CREATE OR REPLACE FUNCTION OCC_HIST(int, int, int, timestamp, timestamp, timestamp)
RETURNS TABLE (ng_id int, score numeric)
AS $$
    WITH OCC1 as (SELECT * from OCC_HIST_PART($1, $2, $4, $5))
       , OCC2 as (SELECT * from OCC_HIST_PART($1, $2, $5, $6))
       , GROWTH as (SELECT ml.ngram_id as ngram_id
                         , COALESCE(OCC1.score, null) as score1
                         , COALESCE(OCC2.score, null) as score2
                    FROM nodes_ngrams ml
                    LEFT JOIN OCC1 ON OCC1.ng_id = ml.ngram_id
                    LEFT JOIN OCC2 ON OCC2.ng_id = ml.ngram_id
                    WHERE ml.node_id = $3
                    ORDER by score2 DESC)
    SELECT ngram_id, COALESCE(ROUND(CAST((100 * (score2 - score1) / COALESCE((score2 + score1), 1)) as numeric), 2), 0)
    from GROWTH
$$
LANGUAGE SQL;

-- BEHAVIORAL TEST (should be equal to occ in terms table)
-- WITH OCC as (SELECT * from OCC_HIST(182856, 183859, '1800-03-15 17:00:00+01', '2300-03-15 17:00:00+01'))
-- SELECT ng_id, score from OCC
--     INNER JOIN nodes_ngrams ml on ml.ngram_id = ng_id
--     AND ml.node_id = 183866
-- ORDER BY score DESC;
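In prose: each OCC_HIST_PART call sums ngram occurrences over one half-period, and OCC_HIST compares two adjacent half-periods with the symmetric relative change 100 * (score2 - score1) / (score2 + score1), rounded to two decimals. A quick numeric check of that formula (values fabricated):

    # Fabricated occurrence sums through OCC_HIST's growth formula.
    score1, score2 = 40.0, 60.0
    growth = round(100 * (score2 - score1) / ((score2 + score1) or 1), 2)
    print(growth)   # 20.0 -> growing; the result stays within [-100, 100]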
install/gargamelle/psql_configure.sh

@@ -12,12 +12,12 @@ echo "::::: POSTGRESQL :::::"
 su postgres -c 'pg_dropcluster 9.4 main --stop'

 #done in docker but redoing it
 rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata

-su postgres -c '/usr/lib/postgresql/9.5/bin/initdb -D /srv/gargandata/'
-su postgres -c '/usr/lib/postgresql/9.5/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
+su postgres -c '/usr/lib/postgresql/9.6/bin/initdb -D /srv/gargandata/'
+su postgres -c '/usr/lib/postgresql/9.6/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'

-su postgres -c 'pg_createcluster -D /srv/gargandata 9.5 main '
-su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.5 main start '
-su postgres -c 'pg_ctlcluster 9.5 main start'
+su postgres -c 'pg_createcluster -D /srv/gargandata 9.6 main '
+su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.6 main start '
+su postgres -c 'pg_ctlcluster 9.6 main start'

 service postgresql start
...
moissonneurs/hal.py (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****   HAL Crawler    *****
# ****************************
# LICENCE: GARGANTEXT.org Licence

RESOURCE_TYPE_HAL = 11

from django.shortcuts import redirect, render
from django.http      import Http404, HttpResponseRedirect \
                           , HttpResponseForbidden

from gargantext.constants       import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes    import Node
from gargantext.util.db         import session
from gargantext.util.db_cache   import cache
from gargantext.util.http       import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain  import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_HAL)

        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)

        print(results)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node( name      = query
                     , user_id   = request.user.id
                     , parent_id = project_id
                     , typename  = 'CORPUS'
                     , hyperdata = {"action": "Scrapping data"}
                     )

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource( type = source["type"]
                           #, name = source["name"]
                           , path = crawler_bot.path
                           )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render( template_name = 'pages/projects/wait.html'
                     , request = request
                     , context = { 'user'   : request.user
                                 , 'project': project
                                 }
                     )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
moissonneurs/multivac.py (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence

RESOURCE_TYPE_MULTIVAC = 10

from django.shortcuts import redirect, render
from django.http      import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants       import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes    import Node
from gargantext.util.db         import session
from gargantext.util.db_cache   import cache
from gargantext.util.http       import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain  import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)

        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)

        print(results)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node( name      = query
                     , user_id   = request.user.id
                     , parent_id = project_id
                     , typename  = 'CORPUS'
                     , hyperdata = { "action"      : "Scrapping data"
                                   , "language_id" : "en"
                                   }
                     )

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource( type = source["type"]
                           #, name = source["name"]
                           , path = crawler_bot.path
                           )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render( template_name = 'pages/projects/wait.html'
                     , request = request
                     , context = { 'user'   : request.user
                                 , 'project': project
                                 }
                     )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
moissonneurs/urls.py

@@ -18,24 +18,31 @@
 from django.conf.urls import url

 import moissonneurs.pubmed   as pubmed
 import moissonneurs.istex    as istex
 import moissonneurs.cern     as cern
+import moissonneurs.multivac as multivac
+import moissonneurs.hal      as hal

 # TODO
-#import moissonneurs.hal       as hal
 #import moissonneurs.revuesOrg as revuesOrg

 # TODO ?
 # REST API for the moissonneurs
 # TODO : ISIDORE

 # /!\ urls patterns here are *without* the trailing slash
 urlpatterns = [ url(r'^pubmed/query$'       , pubmed.query   )
               , url(r'^pubmed/save/(\d+)'   , pubmed.save    )
               , url(r'^istex/query$'        , istex.query    )
               , url(r'^istex/save/(\d+)'    , istex.save     )
               , url(r'^cern/query$'         , cern.query     )
               , url(r'^cern/save/(\d+)'     , cern.save      )
+              , url(r'^multivac/query$'     , multivac.query )
+              , url(r'^multivac/save/(\d+)' , multivac.save  )
+              , url(r'^hal/query$'          , hal.query      )
+              , url(r'^hal/save/(\d+)'      , hal.save       )
               #, url(r'^isidore/query$'     , isidore.query )
               #, url(r'^isidore/save/(\d+)' , isidore.save  )
               ]
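Together with the views above, every source exposes the same two-step flow: POST to .../query to count results, then POST to .../save/<project_id> to launch the corpus build. A hypothetical client-side sketch (host, project id and Django CSRF handling are all assumptions):

    # Hypothetical client of the hal routes above (host and auth assumed away).
    import requests

    base = "http://localhost:8000/moissonneurs"
    nb = requests.post(base + "/hal/query", data={"query": "phylogeny"}).json()
    requests.post( base + "/hal/save/123"   # 123 = an existing project id
                 , data={"query": "phylogeny", "N": nb["results_nb"]} )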
templates/pages/main/about.html

@@ -183,9 +183,55 @@
                 </div>
             </div>
         </div>
     {% endif %}

+    {% if teamPast %}
+    <div class="panel panel-default">
+        <div class="panel-heading">
+            <h2 class="panel-title">
+                <a data-toggle="collapse" data-parent="#accordion" href="#collapseTeamPast">
+                    <center>
+                        <h2>
+                            <span class="glyphicon glyphicon-question-sign" aria-hidden="true"></span>
+                            Former Developers
+                            <span class="glyphicon glyphicon-question-sign" aria-hidden="true"></span>
+                        </h2>
+                    </center>
+                </a>
+            </h2>
+        </div>
+        <div id="collapseTeamPast" class="panel-collapse collapse" role="tabpanel">
+            <div class="panel-body">
+                <div class="container">
+                    <div class="row">
+                        <div class="thumbnails">
+                            {% for member in teamPast %}
+                            <div class="col-md-5 ">
+                                <div class="thumbnail">
+                                    <div class="caption">
+                                        <center>
+                                            <h3>{{ member.first_name }} {{ member.last_name }}</h3>
+                                            {% if member.role %}
+                                            <p class="description">{{ member.role }}</p>
+                                            {% endif %}
+                                        </center>
+                                    </div>
+                                </div>
+                            </div>
+                            {% endfor %}
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    {% endif %}

     </div>
 </div>

 <div class="panel panel-default">
     <div class="panel-heading">
...
templates/pages/menu.html

@@ -367,7 +367,7 @@
 <p>
     Gargantext <span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
-    , version 3.0.6.6,
+    , version 3.0.6.8,
     <a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">
     Copyrights <span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>
...
templates/pages/projects/modals.tpl

@@ -86,12 +86,12 @@
   <button type="button" class="close" data-dismiss="modal" aria-label="Close">
     <span aria-hidden="true">×</span>
   </button>
-  <h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Uploading corpus...</h2>
+  <h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Building corpus...</h2>
 </div>

 <div class="modal-body">
-  <h5>Your file has been uploaded ! Gargantext need some time to eat it. Duration depends on the size of the dish.</h5>
+  <h5>Gargantext is gathering your texts and need some time to eat it. Duration depends on the size of the dish.</h5>
 </div>
...
templates/pages/projects/moissonneurs.js

@@ -209,9 +209,11 @@
 function CustomForSelect( selected ) {
     // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
     selected = selected.toLowerCase()
-    var is_pubmed = ( selected.indexOf('pubmed') != -1 );
-    var is_istex  = ( selected.indexOf('istex')  != -1 );
-    if (is_pubmed || is_istex) {
+    var is_pubmed = ( selected.indexOf('pubmed') != -1 );
+    var is_istex  = ( selected.indexOf('istex')  != -1 );
+    var is_repec  = ( selected.indexOf('repec')  != -1 );
+    if (is_pubmed || is_istex || is_repec) {
         // if(selected=="pubmed") {
         console.log("show the button for: " + selected)
         $("#pubmedcrawl").css("visibility", "visible");
...
templates/pages/projects/project.html

(diff collapsed on the original page; +237, -8; not shown)
templates/pages/projects/wait.html

@@ -199,12 +199,12 @@
   <button type="button" class="close" data-dismiss="modal" aria-label="Close">
     <span aria-hidden="true">×</span>
   </button>
-  <h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Uploading corpus...</h2>
+  <h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Building the corpus...</h2>
 </div>

 <div class="modal-body">
-  <p>Your file has been uploaded ! Gargantext need some time to eat it. Duration depends on the size of the dish.</p>
+  <p>Gargantext is gathering your texts and need some time to eat it. Duration depends on the size of the dish.</p>
 </div>
...