humanities / gargantext

Commit d5652f48, authored Jul 17, 2017 by sim
[REFACT] Clean file utilities API

Parent: 2bbecc32
Showing 9 changed files with 66 additions and 72 deletions.
gargantext/models/nodes.py             +0  -1
gargantext/util/crawlers/HAL.py        +19 -19
gargantext/util/crawlers/ISIDORE.py    +12 -12
gargantext/util/crawlers/MULTIVAC.py   +20 -20
gargantext/util/files.py               +8  -13
gargantext/views/api/api.py            +0  -1
gargantext/views/api/projects.py       +3  -2
gargantext/views/pages/projects.py     +2  -2
moissonneurs/util.py                   +2  -2
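The API change itself lives in gargantext/util/files.py, which renames its three public helpers; the other files only update their imports and call sites to the new names. As a quick orientation, a minimal sketch of what downstream code changes (the payload value is illustrative, not taken from the diff):

```python
from gargantext.constants import UPLOAD_DIRECTORY

# old names, removed by this commit:
#   from gargantext.util.files import save, download, upload
# new names, same signatures:
from gargantext.util.files import file_save, file_download, file_upload

payload = b'{"docs": []}'   # illustrative payload
path = file_save(payload, name='example.json', basedir=UPLOAD_DIRECTORY)
```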
gargantext/models/nodes.py

```diff
 from gargantext.util.db import session
-from gargantext.util.files import upload
 from datetime import datetime
```
gargantext/util/crawlers/HAL.py

```diff
@@ -10,16 +10,16 @@ from ._Crawler import *
 import json

 from gargantext.constants import UPLOAD_DIRECTORY
 from math import trunc
-from gargantext.util.files import save
+from gargantext.util.files import file_save

 class HalCrawler(Crawler):
     ''' HAL API CLIENT'''

     def __init__(self):
         # Main EndPoints
         self.BASE_URL = "https://api.archives-ouvertes.fr"
         self.API_URL  = "search"

         # Final EndPoints
         # TODO : Change endpoint according type of database
         self.URL = self.BASE_URL + "/" + self.API_URL
@@ -49,7 +49,7 @@ class HalCrawler(Crawler):
         """
         #, authUrl_s
         #, type_s

         wt = "json"

         querystring = { "q" : query
@@ -58,18 +58,18 @@ class HalCrawler(Crawler):
                       , "fl" : fl
                       , "wt" : wt
                       }

         # Specify Headers
         headers = { "cache-control" : "no-cache" }

         # Do Request and get response
         response = requests.request( "GET"
                                    , self.URL
                                    , headers = headers
                                    , params  = querystring
                                    )
         #print(querystring)
         # Validation : 200 if ok else raise Value
         if response.status_code == 200:
@@ -80,27 +80,27 @@ class HalCrawler(Crawler):
             return (json.loads(response.content.decode(charset)))
         else:
             raise ValueError(response.status_code, response.reason)

     def scan_results(self, query):
         '''
         scan_results : Returns the number of results
         Query String -> Int
         '''
         self.results_nb = 0

         total = ( self._get(query)
                 .get("response", {})
                 .get("numFound", 0)
                 )

         self.results_nb = total
         return self.results_nb

     def download(self, query):
         downloaded = False
         self.status.append("fetching results")
         corpus = []
@@ -114,7 +114,7 @@ class HalCrawler(Crawler):
                   )
             print("ERROR (scrap: Multivac d/l ): ", msg)

         self.query_max = QUERY_SIZE_N_MAX

         #for page in range(1, trunc(self.query_max / 100) + 2):
         for page in range(0, self.query_max, paging):
             print("Downloading page %s to %s results" % (page, paging))
@@ -126,10 +126,10 @@ class HalCrawler(Crawler):
             for doc in docs:
                 corpus.append(doc)

-        self.path = save( json.dumps(corpus).encode("utf-8")
-                        , name    = 'HAL.json'
-                        , basedir = UPLOAD_DIRECTORY
-                        )
+        self.path = file_save( json.dumps(corpus).encode("utf-8")
+                             , name    = 'HAL.json'
+                             , basedir = UPLOAD_DIRECTORY
+                             )
         downloaded = True
         return downloaded
```
gargantext/util/crawlers/ISIDORE.py

```diff
@@ -10,18 +10,18 @@ from ._Crawler import *
 import json

 from gargantext.constants import UPLOAD_DIRECTORY
 from math import trunc
-from gargantext.util.files import save
+from gargantext.util.files import file_save
 from gargantext.util.crawlers.sparql.bool2sparql import bool2sparql, isidore

 class IsidoreCrawler(Crawler):
     ''' ISIDORE SPARQL API CLIENT'''

     def __init__(self):
         # Main EndPoints
         self.BASE_URL = "https://www.rechercheisidore.fr"
         self.API_URL  = "sparql"

         # Final EndPoints
         # TODO : Change endpoint according type of database
         self.URL = self.BASE_URL + "/" + self.API_URL
@@ -35,7 +35,7 @@ class IsidoreCrawler(Crawler):
     def _get(self, query, offset=0, limit=None, lang=None):
         '''Parameters to download data'''
         isidore(query, count=False, offset=offset, limit=limit)

     def scan_results(self, query):
@@ -47,9 +47,9 @@ class IsidoreCrawler(Crawler):
         return self.results_nb

     def download(self, query):
         downloaded = False
         self.status.append("fetching results")
         corpus = []
@@ -63,17 +63,17 @@ class IsidoreCrawler(Crawler):
                   )
             print("WARNING (scrap: ISIDORE d/l ): ", msg)

         self.query_max = QUERY_SIZE_N_MAX

         for offset in range(0, self.query_max, limit):
             print("Downloading result %s to %s" % (offset, self.query_max))

             for doc in isidore(query, offset=offset, limit=limit):
                 corpus.append(doc)

-        self.path = save( json.dumps(corpus).encode("utf-8")
-                        , name    = 'ISIDORE.json'
-                        , basedir = UPLOAD_DIRECTORY
-                        )
+        self.path = file_save( json.dumps(corpus).encode("utf-8")
+                             , name    = 'ISIDORE.json'
+                             , basedir = UPLOAD_DIRECTORY
+                             )
         downloaded = True
         return downloaded
```
gargantext/util/crawlers/MULTIVAC.py

```diff
@@ -11,18 +11,18 @@ import json
 from gargantext.settings import API_TOKENS
 from gargantext.constants import UPLOAD_DIRECTORY
 from math import trunc
-from gargantext.util.files import save
+from gargantext.util.files import file_save

 class MultivacCrawler(Crawler):
     ''' Multivac API CLIENT'''

     def __init__(self):
         self.apikey = API_TOKENS["MULTIVAC"]

         # Main EndPoints
         self.BASE_URL = "https://api.iscpif.fr/v2"
         self.API_URL  = "pvt/economy/repec/search"

         # Final EndPoints
         # TODO : Change endpoint according type of database
         self.URL = self.BASE_URL + "/" + self.API_URL
@@ -39,21 +39,21 @@ class MultivacCrawler(Crawler):
                       , "from"    : fromPage
                       , "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
                       }

         if lang is not None:
             querystring["lang"] = lang

         # Specify Headers
         headers = { "cache-control" : "no-cache" }

         # Do Request and get response
         response = requests.request( "GET"
                                    , self.URL
                                    , headers = headers
                                    , params  = querystring
                                    )
         #print(querystring)
         # Validation : 200 if ok else raise Value
         if response.status_code == 200:
@@ -64,27 +64,27 @@ class MultivacCrawler(Crawler):
             return (json.loads(response.content.decode(charset)))
         else:
             raise ValueError(response.status_code, response.reason)

     def scan_results(self, query):
         '''
         scan_results : Returns the number of results
         Query String -> Int
         '''
         self.results_nb = 0

         total = ( self._get(query)
                 .get("results", {})
                 .get("total", 0)
                 )

         self.results_nb = total
         return self.results_nb

     def download(self, query):
         downloaded = False
         self.status.append("fetching results")
         corpus = []
@@ -98,7 +98,7 @@ class MultivacCrawler(Crawler):
                   )
             print("ERROR (scrap: Multivac d/l ): ", msg)

         self.query_max = QUERY_SIZE_N_MAX

         for page in range(1, trunc(self.query_max / 100) + 2):
             print("Downloading page %s to %s results" % (page, paging))

             docs = (self._get(query, fromPage=page, count=paging)
@@ -109,10 +109,10 @@ class MultivacCrawler(Crawler):
             for doc in docs:
                 corpus.append(doc)

-        self.path = save( json.dumps(corpus).encode("utf-8")
-                        , name    = 'Multivac.json'
-                        , basedir = UPLOAD_DIRECTORY
-                        )
+        self.path = file_save( json.dumps(corpus).encode("utf-8")
+                             , name    = 'Multivac.json'
+                             , basedir = UPLOAD_DIRECTORY
+                             )
         downloaded = True
         return downloaded
```
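All three crawlers end their download() the same way: accumulate the fetched documents, serialize them, and hand the bytes to the renamed file_save. A condensed sketch of that shared tail (the finish_download helper name is mine, not from the code):

```python
import json

from gargantext.constants import UPLOAD_DIRECTORY
from gargantext.util.files import file_save

def finish_download(corpus, name):
    """Serialize the docs accumulated by a crawler loop and store them.

    `name` is the per-source file name ('HAL.json', 'ISIDORE.json',
    'Multivac.json' in the diffs above); returns the stored file's path.
    """
    return file_save(json.dumps(corpus).encode("utf-8"),
                     name=name,
                     basedir=UPLOAD_DIRECTORY)
```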
gargantext/util/files.py

```diff
-from gargantext.constants import *
 import os
+from gargantext.constants import DOWNLOAD_DIRECTORY, UPLOAD_LIMIT, UPLOAD_DIRECTORY
 from gargantext.util.digest import str_digest
 from gargantext.util import http


-def save(contents, name='', basedir=''):
+def file_save(contents, name='', basedir=''):
     digest = str_digest(contents[:4096] + contents[-4096:])
     path = basedir

     for i in range(2, 8, 2):
@@ -16,29 +18,22 @@ def save(contents, name='', basedir=''):
     return path


-def download(url, name=''):
-    return save(
+def file_download(url, name=''):
+    return file_save(
         contents = http.get(url),
         name     = name,
         basedir  = DOWNLOAD_DIRECTORY,
     )


 def check_format(corpus_type, name):
     #~ if True:
     acc_formats = RESOURCETYPES[corpus_type]["accepted_formats"]
     if name.split(".")[-1].lower() not in acc_formats:
         raise TypeError('Uncorrect format of file. File must be a %s file'
                         % " or ".join(acc_formats))


-def upload(uploaded):
+def file_upload(uploaded):
     if uploaded.size > UPLOAD_LIMIT:
         raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
             uploaded.size,
             UPLOAD_LIMIT,
         ))
-    return save(
+    return file_save(
         contents = uploaded.file.read(),
         name     = uploaded.name,
         basedir  = UPLOAD_DIRECTORY,
```
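After this commit the module's public helpers are file_save, file_download and file_upload. A minimal usage sketch, assuming a configured gargantext environment; the URL is a placeholder:

```python
from gargantext.constants import UPLOAD_DIRECTORY
from gargantext.util.files import file_save, file_download, file_upload

# Store raw bytes; the destination path is built under `basedir` from a
# digest of the contents, and the resulting path is returned.
path = file_save(b'{"docs": []}', name='corpus.json', basedir=UPLOAD_DIRECTORY)

# Fetch a URL through gargantext.util.http and store it in DOWNLOAD_DIRECTORY.
path = file_download('https://example.org/data.json', name='data.json')

# Store an uploaded file object (needs .size, .name and .file.read());
# raises IOError when .size exceeds UPLOAD_LIMIT.
# path = file_upload(request.FILES['file'])
```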
gargantext/views/api/api.py

```diff
@@ -10,7 +10,6 @@ from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
 from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
 from gargantext.util.db import session, delete, func, bulk_insert
 from gargantext.util.db_cache import cache, or_
-from gargantext.util.files import upload
 from gargantext.util.http import ValidationException, APIView, JsonHttpResponse, get_parameters
 from gargantext.util.scheduling import scheduled
 from gargantext.util.validation import validate
```
gargantext/views/api/projects.py

```diff
@@ -5,6 +5,7 @@ from collections import defaultdict
 from gargantext.util.toolchain import *
 import copy
 from gargantext.util.db import session
+from gargantext.util.files import file_upload

 class ProjectList(APIView):
     '''API endpoint that represent a list of projects owned by a user'''
@@ -237,7 +238,7 @@ class ProjectView(APIView):
                             parent_id = corpus.id,
                             hyperdata = { "type"   : source["type"],
                                           "method" : method,
-                                          "file"   : upload(corpus_file),
+                                          "file"   : file_upload(corpus_file),
                                           "query"  : None }
             )
             session.add(resource)
@@ -485,7 +486,7 @@ class ProjectView(APIView):
                 #corpus_name = form["name"],
             )
             resource.method = form["method"]
-            resource.path   = upload(form['file'])
+            resource.path   = file_upload(form['file'])

             #mapping the default attribute of a given source from constant RESOURCETYPE
             for k, v in get_resource(int(form["source"])).items():
                 setattr(resource, k, v)
```
gargantext/views/pages/projects.py

```diff
 from gargantext.util.http import *
 from gargantext.util.db import *
 from gargantext.util.db_cache import cache
-from gargantext.util.files import upload
+from gargantext.util.files import file_upload
 from gargantext.models import *
 from gargantext.constants import *

 from .main import get_user_params
@@ -124,7 +124,7 @@ def project(request, project_id):
             )
             corpus.add_resource(
                 type = int(request.POST['type']),
-                path = upload(request.FILES['file']),
+                path = file_upload(request.FILES['file']),
             )
             session.add(corpus)
             session.commit()
```
moissonneurs/util.py

```diff
-from gargantext.util.files import download
+from gargantext.util.files import file_download

 import sys
 import time
@@ -98,7 +98,7 @@ class Scraper :
     # generic!
     def download(self, url):
         print(url)
-        filename = download(url)
+        filename = file_download(url)
         with self.lock:
             print(threading.current_thread().name, filename + " OK")
         return filename
```
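Worth noting in this last file: inside Scraper.download, the bare call download(url) used to resolve to the imported module-level helper rather than to the method of the same name, so the old code worked but read ambiguously; the prefixed name makes the intent explicit. A stripped-down illustration, with the class body reduced to the relevant call:

```python
from gargantext.util.files import file_download

class Scraper:
    def download(self, url):
        # Unambiguous now: this calls the file utility, not self.download.
        filename = file_download(url)
        return filename
```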