Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
2e8dd028
Commit
2e8dd028
authored
May 11, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Adding dependencies + refresh requirements.pip => [BUG Report]
parent
d19c6877
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
264 additions
and
0 deletions
+264
-0
requirements.pip
gargantext/requirements.pip
+33
-0
cern.py
moissonneurs/cern.py
+231
-0
No files found.
gargantext/requirements.pip
0 → 100644
View file @
2e8dd028
amqp==1.4.9
anyjson==0.3.3
beautifulsoup4==4.4.1
billiard==3.3.0.22
celery==3.1.20
chardet==2.3.0
dateparser==0.3.2
decorator==4.0.9
Django==1.9.2
django-celery==3.1.17
django-pgfields==1.4.4
django-pgjsonb==0.0.16
djangorestframework==3.3.2
html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
networkx==1.11
nltk==3.1
numpy==1.10.4
pandas==0.18.0
# pkg-resources==0.0.0  -- removed: artifact of the Debian/Ubuntu `pip freeze` bug, not an installable PyPI package (breaks `pip install -r`)
psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2
pytz==2015.7
PyYAML==3.11
RandomWords==0.1.12
requests==2.10.0
six==1.10.0
SQLAlchemy==1.1.0b1.dev0
ujson==1.35
umalqurra==0.2
moissonneurs/cern.py
0 → 100644
View file @
2e8dd028
# ****************************
# ***** CERN Scrapper *****
# ****************************
import
json
import
datetime
from
os
import
path
import
threading
import
hmac
,
hashlib
import
requests
import
lxml
import
subprocess
import
urllib.parse
as
uparse
from
lxml
import
etree
from
bs4
import
BeautifulSoup
,
Comment
from
collections
import
defaultdict
#from gargantext.util.files import download
from
gargantext.settings
import
API_TOKENS
as
API
#from private import API_PERMISSIONS
# Credentials for the CERN SCOAP3 API, pulled from the project settings
# (``API_TOKENS`` is imported above under the alias ``API``).
# NOTE(review): ``API["CERN"]`` must therefore exist in
# gargantext.settings.API_TOKENS — a missing key raises KeyError at import time.
API_TOKEN = API["CERN"]
def query(request):
    """Validate and log a POSTed CERN sample query.

    Reads ``query`` (the raw query string) and ``N`` (requested sample
    size) from ``request.POST``, rejects oversized requests, and logs
    both values with a timestamp.

    :param request: Django-style request object (only ``method`` and
        ``POST`` are used).
    :raises ValueError: when ``N`` exceeds ``QUERY_SIZE_N_MAX``.
    :returns: None — this helper only validates and logs.
    """
    print(request.method)
    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        # NOTE(review): QUERY_SIZE_N_MAX is neither defined nor imported in
        # this module, so this branch currently raises NameError; it is
        # presumably meant to come from gargantext.constants — confirm.
        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            # BUGFIX: the label used to read "pubmed stats" — a copy-paste
            # from the PubMed harvester; this module is the CERN scraper.
            print("ERROR(scrap: cern stats): ", msg)
            raise ValueError(msg)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
def save(request, project_id):
    """Create a CORPUS node under ``project_id`` and run a threaded
    download + parse workflow for the POSTed query.

    NOTE(review): this body appears copy-pasted from the ISTEX harvester
    (it queries api.istex.fr, logs "Scrapping Istex" and tags resources
    as 'ISTex'), not CERN/SCOAP3 — confirm intended behavior.

    NOTE(review): ``session``, ``Node``, ``Scraper``, ``Http404``,
    ``redirect``, ``HttpResponseForbidden``, ``QUERY_SIZE_N_MAX``,
    ``resourcetype``, ``JsonHttpResponse``, ``scheduled``,
    ``parse_extract_indexhyperdata``, ``print_tb`` and ``render`` are not
    imported in this module; as written every call path raises NameError.
    """
    print("testCERN:")
    print(request.method)
    alist = ["bar", "foo"]  # NOTE(review): unused local
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()
    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q
        if "N" in request.POST:
            N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ", msg)
                raise ValueError(msg)
        print("Scrapping Istex: '%s' (%i)" % (query_string, N))
        # Build one paginated API request URL per chunk of the sample.
        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            # last page may be smaller than pagesize
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0])
                           + "&size=" + str(pagesize))
        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={"action": "Scrapping data",
                       "language_id": None}
        )
        # Fan out downloads over 8 worker threads fed by a queue.
        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished
        # Attach each successfully downloaded file to the corpus.
        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=resourcetype('ISTex'), path=filename)
                dwnldsOK += 1
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            # Kick off the async parsing/indexing workflow on the new corpus.
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            # best-effort traceback dump; NOTE(review): bare except hides
            # any failure here, including KeyboardInterrupt
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
        # --------------------------------------------
        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )
    # NOTE(review): on a non-POST request query_string/query/N are unbound
    # here, so this path raises NameError — confirm against the original
    # istex view this was copied from.
    data = [query_string, query, N]
    return JsonHttpResponse(data)
class CERN_API(object):
    '''CERN SCOAP3 Interaction.

    Builds a signed query URL for the SCOAP3 API (HMAC-SHA1 salted with
    the API secret), downloads the XML result to a local file, and offers
    a MARCXML parser for the records.
    '''

    def __init__(self, query, filename="./results.xml"):
        # query   : raw query string, e.g. "of=xm&q=..."
        # filename: path where the raw XML response is written
        self.query = query
        self.apikey = API["TOKEN"]
        self.secret = API["SECRET"].encode("utf-8")
        # BUGFIX: BASE_URL must be assigned BEFORE get_results() runs,
        # because get_results() -> sign_url() -> __format_url__() reads it;
        # the original order raised AttributeError on construction.
        self.BASE_URL = u"http://api.scoap3.org/search?"
        self.results = self.get_results(filename)

    def __generate_signature__(self, url):
        '''Return the hex HMAC-SHA1 digest of *url* (bytes) salted with the secret.'''
        # hmac-sha1 salted with secret
        return hmac.new(self.secret, url, hashlib.sha1).hexdigest()

    def __format_url__(self):
        '''Return BASE_URL plus the url-encoded query (apikey added, params sorted).'''
        dict_q = uparse.parse_qs(self.query)
        # add the apikey
        dict_q["apikey"] = [self.apikey]
        params = "&".join([(str(k) + "=" + str(uparse.quote(v[0])))
                           for k, v in sorted(dict_q.items())])
        return self.BASE_URL + params

    def sign_url(self):
        '''Return the formatted URL with its &signature= parameter appended.'''
        url = self.__format_url__()
        return url + "&signature=" + self.__generate_signature__(url.encode("utf-8"))

    def get_results(self, filename):
        '''Stream the signed-URL response into *filename*; return the filename.'''
        url = self.sign_url()
        r = requests.get(url, stream=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
        return filename

    def parse_xml(self, filename, MARCXML):
        '''Parse MARCXML records from *filename* into a list of dicts.

        BUGFIXES vs. the original:
          * ``self`` was missing from the signature although the body
            used ``self.MARC21``;
          * the file was read twice (the second read returned "" so the
            loop never ran) and opened ``self.filename`` (never set)
            instead of the *filename* parameter;
          * ``r.decode('utf-8')`` was called on a dict, which has no
            ``.decode`` — the dict itself is appended now.

        NOTE(review): ``self.MARC21`` is never defined on this class and
        ``JsonHttpResponse`` is not imported in this module — both must be
        provided elsewhere before this method can run. ``MARCXML`` is
        kept for interface compatibility but unused.
        '''
        with open(filename, 'r') as f:
            data = f.read()
        records = []
        for record in data.split("<record>")[1:]:
            soup = BeautifulSoup("<record>" + record, "lxml")
            # one list-valued slot per repeatable "700" (author) subfield
            r = {v: [] for v in self.MARC21["700"].values()}
            r["uid"] = soup.find("controlfield").text
            for field in soup.find_all("datafield"):
                tag = field.get("tag")
                if tag in self.MARC21.keys():
                    for sub in field.find_all("subfield"):
                        code = sub.get("code")
                        if code in self.MARC21[tag].keys():
                            if tag == "700":
                                # repeatable field: accumulate
                                r[self.MARC21[tag][code]].append(sub.text)
                            else:
                                r[self.MARC21[tag][code]] = sub.text
            records.append(r)
        return JsonHttpResponse(records)
#query="of=xm"
#a = CERN_API(query, "./full.xml")
#p = CERNParser("./full.xml")
#print(p.MARC21.keys())
#~ #p.parse()
#~ with open("./results_full.json", "r") as f:
#~ data = json.load(f)
#~ for record in data["records"]:
#~ print(record.keys())
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment