Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
462c9ecc
Commit
462c9ecc
authored
May 11, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Intégration de CERN [OK] : manque l'option search corpus ; juste au stade send the job
parent
649f366b
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
94 additions
and
135 deletions
+94
-135
constants.py
gargantext/constants.py
+1
-1
Cern.py
gargantext/util/parsers/Cern.py
+7
-4
projects.py
gargantext/views/pages/projects.py
+13
-4
cern.py
moissonneurs/cern.py
+67
-121
project.html
templates/pages/projects/project.html
+6
-5
No files found.
gargantext/constants.py
View file @
462c9ecc
...
@@ -203,7 +203,7 @@ RESOURCETYPES = [
...
@@ -203,7 +203,7 @@ RESOURCETYPES = [
'accepted_formats'
:[
"zip"
,],
'accepted_formats'
:[
"zip"
,],
},
},
# type 10
# type 10
{
"name"
:
'
Cern (MARC21 XML
)'
,
{
"name"
:
'
SCOAP (XML MARC21 Format
)'
,
"parser"
:
CernParser
,
"parser"
:
CernParser
,
"default_language"
:
"en"
,
"default_language"
:
"en"
,
'accepted_formats'
:[
"zip"
,
"xml"
],
'accepted_formats'
:[
"zip"
,
"xml"
],
...
...
gargantext/util/parsers/Cern.py
View file @
462c9ecc
from
._Parser
import
Parser
from
._Parser
import
Parser
from
datetime
import
datetime
from
datetime
import
datetime
from
io
import
BytesIO
from
bs4
import
BeautifulSoup
#from io import BytesIO
from
io
import
StringIO
import
json
import
json
from
lxml
import
etree
class
CernParser
(
Parser
):
class
CernParser
(
Parser
):
MARC21
=
{
MARC21
=
{
...
@@ -34,8 +36,9 @@ class CernParser(Parser):
...
@@ -34,8 +36,9 @@ class CernParser(Parser):
}
}
def
parse
(
self
,
filebuf
):
def
parse
(
self
,
filebuf
):
tree
=
etree
.
tostring
(
filebuf
)
doc
=
etree
.
parse
(
filebuf
)
#root = tree.getroot()
tree
=
etree
.
tostring
(
doc
)
#parser = etree.XMLParser()
hyperdata_list
=
[]
hyperdata_list
=
[]
soup
=
BeautifulSoup
(
tree
,
"lxml"
)
soup
=
BeautifulSoup
(
tree
,
"lxml"
)
for
record
in
soup
.
find_all
(
"record"
):
for
record
in
soup
.
find_all
(
"record"
):
...
...
gargantext/views/pages/projects.py
View file @
462c9ecc
...
@@ -4,7 +4,6 @@ from gargantext.util.db_cache import cache
...
@@ -4,7 +4,6 @@ from gargantext.util.db_cache import cache
from
gargantext.util.files
import
upload
from
gargantext.util.files
import
upload
from
gargantext.models
import
*
from
gargantext.models
import
*
from
gargantext.constants
import
*
from
gargantext.constants
import
*
from
gargantext.util.scheduling
import
scheduled
from
gargantext.util.scheduling
import
scheduled
from
gargantext.util.toolchain
import
parse_extract_indexhyperdata
from
gargantext.util.toolchain
import
parse_extract_indexhyperdata
...
@@ -17,7 +16,7 @@ import re
...
@@ -17,7 +16,7 @@ import re
@
requires_auth
@
requires_auth
def
overview
(
request
):
def
overview
(
request
):
'''This view show all projects for a given user.
'''This view show all projects for a given user.
Each project is described with hyperdata that are updated
ed
on each following view.
Each project is described with hyperdata that are updated on each following view.
To each project, we can link a resource that can be an image.
To each project, we can link a resource that can be an image.
'''
'''
...
@@ -63,13 +62,20 @@ class NewCorpusForm(forms.Form):
...
@@ -63,13 +62,20 @@ class NewCorpusForm(forms.Form):
choices
=
enumerate
(
resource_type
[
'name'
]
for
resource_type
in
RESOURCETYPES
),
choices
=
enumerate
(
resource_type
[
'name'
]
for
resource_type
in
RESOURCETYPES
),
widget
=
forms
.
Select
(
attrs
=
{
'onchange'
:
'CustomForSelect( $("option:selected", this).text() );'
})
widget
=
forms
.
Select
(
attrs
=
{
'onchange'
:
'CustomForSelect( $("option:selected", this).text() );'
})
)
)
name
=
forms
.
CharField
(
label
=
'Name'
,
max_length
=
199
,
widget
=
forms
.
TextInput
(
attrs
=
{
'required'
:
'true'
}))
name
=
forms
.
CharField
(
label
=
'Name'
,
max_length
=
199
,
widget
=
forms
.
TextInput
(
attrs
=
{
'required'
:
'true'
}))
file
=
forms
.
FileField
()
file
=
forms
.
FileField
()
def
clean_file
(
self
):
def
clean_file
(
self
):
file_
=
self
.
cleaned_data
.
get
(
'file'
)
file_
=
self
.
cleaned_data
.
get
(
'file'
)
if
len
(
file_
)
>
1024
**
3
:
# we don't accept more than 1GB
if
len
(
file_
)
>
UPLOAD_LIMIT
:
# we don't accept more than 1GB
raise
forms
.
ValidationError
(
ugettext_lazy
(
'File too heavy! (>1GB).'
))
raise
forms
.
ValidationError
(
ugettext_lazy
(
'File too heavy! (>1GB).'
))
return
file_
return
file_
def check_filename(self):
    """Debug helper: print the cleaned form data and the uploaded file's extension.

    Reads ``self.cleaned_data["file"]``.  The value may be an uploaded-file
    object (its filename is then taken from ``.name``) or a plain string
    (used as the filename directly).  Nothing is printed for the extension
    when no file is present or when no filename string can be determined.

    Returns:
        None.
    """
    print(self.cleaned_data)

    upload = self.cleaned_data.get("file")
    if upload is None:
        # No file in the cleaned data: nothing to inspect.
        return

    # An uploaded-file object has no ``split``; its filename lives in ``.name``.
    filename = getattr(upload, "name", upload)
    if isinstance(filename, str):
        # Text after the last dot (the whole name when there is no dot).
        print(filename.split(".")[-1])

    # TODO: reject the file when this extension is not listed in the
    # 'accepted_formats' of the selected RESOURCETYPES entry.
@
requires_auth
@
requires_auth
...
@@ -108,9 +114,9 @@ def project(request, project_id):
...
@@ -108,9 +114,9 @@ def project(request, project_id):
},
},
)
)
# corpora within this project
# corpora within this project
corpora
=
project
.
children
(
'CORPUS'
,
order
=
True
)
.
all
()
corpora
=
project
.
children
(
'CORPUS'
,
order
=
True
)
.
all
()
print
(
corpora
)
sourcename2corpora
=
defaultdict
(
list
)
sourcename2corpora
=
defaultdict
(
list
)
for
corpus
in
corpora
:
for
corpus
in
corpora
:
# we only consider the first resource of the corpus to determine its type
# we only consider the first resource of the corpus to determine its type
...
@@ -118,8 +124,11 @@ def project(request, project_id):
...
@@ -118,8 +124,11 @@ def project(request, project_id):
if
len
(
resources
):
if
len
(
resources
):
resource
=
resources
[
0
]
resource
=
resources
[
0
]
resource_type_name
=
RESOURCETYPES
[
resource
[
'type'
]][
'name'
]
resource_type_name
=
RESOURCETYPES
[
resource
[
'type'
]][
'name'
]
resource_type_accepted_formats
=
RESOURCETYPES
[
resource
[
'type'
]][
'accepted_formats'
]
else
:
else
:
print
(
"(WARNING) PROJECT view: no listed resource"
)
print
(
"(WARNING) PROJECT view: no listed resource"
)
print
(
"(DEBUG) PROJECT view: one of the corpus has an invalid type"
)
raise
Http404
(
"One of the corpus has an invalid type"
)
# add some data for the viewer
# add some data for the viewer
corpus
.
count
=
corpus
.
children
(
'DOCUMENT'
)
.
count
()
corpus
.
count
=
corpus
.
children
(
'DOCUMENT'
)
.
count
()
status
=
corpus
.
status
()
status
=
corpus
.
status
()
...
...
moissonneurs/cern.py
View file @
462c9ecc
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****************************
# ***** CERN Scrapper *****
# ***** CERN Scrapper *****
# ****************************
# ****************************
import
logging
from
logging.handlers
import
RotatingFileHandler
# Create the logger object used to write to the logs
# (getLogger() without a name returns the root logger).
logger = logging.getLogger()
# Set the logger level to DEBUG so that it writes everything.
logger.setLevel(logging.DEBUG)

# Create a formatter that adds the time and the level
# to each message written to the log.
# NOTE(review): no active handler below uses this formatter now that the
# file handler is commented out.
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
# Create a handler that redirects log writes to a file
# in 'append' mode, with 1 backup and a maximum size of 1 MB.
#>>> Permission denied: conflicts with the Django logs
#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
# Set its level to DEBUG, tell it to use the formatter
# created above, and add this handler to the logger.
#~ file_handler.setLevel(logging.DEBUG)
#~ file_handler.setFormatter(formatter)
#~ logger.addHandler(file_handler)
# Create a second handler that redirects every log write
# to the console.
steam_handler = logging.StreamHandler()
steam_handler.setLevel(logging.DEBUG)
logger.addHandler(steam_handler)
import
json
import
json
import
datetime
import
datetime
from
os
import
path
from
os
import
path
...
@@ -20,11 +52,38 @@ from collections import defaultdict
...
@@ -20,11 +52,38 @@ from collections import defaultdict
from
gargantext.settings
import
API_TOKENS
as
API
from
gargantext.settings
import
API_TOKENS
as
API
#from private import API_PERMISSIONS
#from private import API_PERMISSIONS
API_TOKEN
=
API
[
"CERN"
]
def save(request, project_id):
    """Create a CORPUS node under a project and attach a CERN/SCOAP resource.

    Args:
        request: the incoming HTTP request; on POST, the ``"query"`` and
            ``"string"`` fields are read from ``request.POST``.
        project_id: identifier of the target project (anything ``int()``
            accepts).

    Returns:
        ``HttpResponseForbidden`` when the requesting user does not own the
        project.  NOTE(review): every other path currently falls through and
        returns ``None``; the view still needs a success response.

    Raises:
        Http404: when ``project_id`` is not an integer or no node has that id.
    """
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        # HttpResponseForbidden is a response object, not an exception:
        # raising it fails with a TypeError, so it has to be returned.
        return HttpResponseForbidden()

    if request.method == "POST":
        # NOTE(review): `query` is read (a missing field still raises) but is
        # not used yet.
        query = request.POST["query"]
        name = request.POST["string"]

        corpus = project.add_child(name=name, typename="CORPUS")
        # The resource type is looked up by its display name, which this
        # change set renames to 'SCOAP (XML MARC21 Format)' in RESOURCETYPES.
        # NOTE(review): `filename` is not defined anywhere in this function;
        # unless a module-level `filename` exists this line raises NameError.
        corpus.add_resource(
            type=resourcetype('SCOAP (XML MARC21 Format)'),
            path=filename,
            url=None,
        )
        print("Adding the resource")
def
query
(
request
):
def
query
(
request
):
print
(
request
.
method
)
print
(
request
.
method
)
alist
=
[]
alist
=
[]
if
request
.
method
==
"POST"
:
if
request
.
method
==
"POST"
:
query
=
request
.
POST
[
"query"
]
query
=
request
.
POST
[
"query"
]
N
=
int
(
request
.
POST
[
"N"
])
N
=
int
(
request
.
POST
[
"N"
])
...
@@ -36,129 +95,16 @@ def query( request ):
...
@@ -36,129 +95,16 @@ def query( request ):
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" query ="
,
query
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" query ="
,
query
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" N ="
,
N
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" N ="
,
N
)
#Here Requests API
#
#API_TOKEN = API["CERN"]
def
save
(
request
,
project_id
):
#instancia = Scraper()
print
(
"testCERN:"
)
print
(
request
.
method
)
alist
=
[
"bar"
,
"foo"
]
# implicit global session
# do we have a valid project id?
try
:
project_id
=
int
(
project_id
)
except
ValueError
:
raise
Http404
()
# do we have a valid project?
# serialFetcher (n_last_years, query, query_size)
project
=
(
session
#alist = instancia.serialFetcher( 5, query , N )
.
query
(
Node
)
.
filter
(
Node
.
id
==
project_id
)
.
filter
(
Node
.
typename
==
'PROJECT'
)
)
.
first
()
if
project
is
None
:
raise
Http404
()
# do we have a valid user?
data
=
alist
user
=
request
.
user
if
not
user
.
is_authenticated
():
return
redirect
(
'/auth/?next=
%
s'
%
request
.
path
)
if
project
.
user_id
!=
user
.
id
:
return
HttpResponseForbidden
()
if
request
.
method
==
"POST"
:
query
=
"-"
query_string
=
"-"
N
=
0
if
"query"
in
request
.
POST
:
query
=
request
.
POST
[
"query"
]
query_string
=
query
.
replace
(
" "
,
"+"
)
# url encoded q
if
"N"
in
request
.
POST
:
N
=
int
(
request
.
POST
[
"N"
])
# query_size from views_opti
if
N
>
QUERY_SIZE_N_MAX
:
msg
=
"Invalid sample size N =
%
i (max =
%
i)"
%
(
N
,
QUERY_SIZE_N_MAX
)
print
(
"ERROR (scrap: istex d/l ): "
,
msg
)
raise
ValueError
(
msg
)
print
(
"Scrapping Istex: '
%
s' (
%
i)"
%
(
query_string
,
N
))
urlreqs
=
[]
pagesize
=
50
tasks
=
Scraper
()
chunks
=
list
(
tasks
.
chunks
(
range
(
N
),
pagesize
))
for
k
in
chunks
:
if
(
k
[
0
]
+
pagesize
)
>
N
:
pagesize
=
N
-
k
[
0
]
urlreqs
.
append
(
"http://api.istex.fr/document/?q="
+
query_string
+
"&output=*&"
+
"from="
+
str
(
k
[
0
])
+
"&size="
+
str
(
pagesize
))
# corpus node instanciation as a Django model
corpus
=
Node
(
name
=
query
,
user_id
=
request
.
user
.
id
,
parent_id
=
project_id
,
typename
=
'CORPUS'
,
hyperdata
=
{
"action"
:
"Scrapping data"
,
"language_id"
:
None
}
)
tasks
=
Scraper
()
for
i
in
range
(
8
):
t
=
threading
.
Thread
(
target
=
tasks
.
worker2
)
#thing to do
t
.
daemon
=
True
# thread dies when main thread (only non-daemon thread) exits.
t
.
start
()
for
url
in
urlreqs
:
tasks
.
q
.
put
(
url
)
#put a task in th queue
tasks
.
q
.
join
()
# wait until everything is finished
dwnldsOK
=
0
for
filename
in
tasks
.
firstResults
:
if
filename
!=
False
:
# add the uploaded resource to the corpus
corpus
.
add_resource
(
type
=
resourcetype
(
'ISTex'
)
,
path
=
filename
)
dwnldsOK
+=
1
session
.
add
(
corpus
)
session
.
commit
()
corpus_id
=
corpus
.
id
if
dwnldsOK
==
0
:
return
JsonHttpResponse
([
"fail"
])
###########################
###########################
try
:
scheduled
(
parse_extract_indexhyperdata
)(
corpus_id
)
except
Exception
as
error
:
print
(
'WORKFLOW ERROR'
)
print
(
error
)
try
:
print_tb
(
error
.
__traceback__
)
except
:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session
.
rollback
()
# --------------------------------------------
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
context
=
{
'user'
:
request
.
user
,
'project'
:
project
,
},
)
data
=
[
query_string
,
query
,
N
]
return
JsonHttpResponse
(
data
)
return
JsonHttpResponse
(
data
)
...
...
templates/pages/projects/project.html
View file @
462c9ecc
...
@@ -98,7 +98,7 @@
...
@@ -98,7 +98,7 @@
<button
type=
"button"
class=
"btn btn-default"
data-container=
"body"
data-toggle=
"popover"
data-placement=
"bottom"
<button
type=
"button"
class=
"btn btn-default"
data-container=
"body"
data-toggle=
"popover"
data-placement=
"bottom"
data-content=
"
data-content=
"
<ul>
<ul>
<li
<li
onclick="
onclick="
garganrest.nodes.delete({{corpus.id}}, function(){$('#corpus_'+{{corpus.id}}).remove()});
garganrest.nodes.delete({{corpus.id}}, function(){$('#corpus_'+{{corpus.id}}).remove()});
$(this).parent().parent().remove();
$(this).parent().parent().remove();
...
@@ -142,9 +142,9 @@
...
@@ -142,9 +142,9 @@
</span>
</span>
</div>
</div>
{% endifequal %}
{% endifequal %}
{% ifequal state.action "ngrams_extraction" %}
{% ifequal state.action "ngrams_extraction" %}
<div
class=
"progress-bar progress-bar-striped
<div
class=
"progress-bar progress-bar-striped
{% if state.complete %}
{% if state.complete %}
progress-bar-success
progress-bar-success
{% else %}
{% else %}
...
@@ -162,7 +162,7 @@
...
@@ -162,7 +162,7 @@
</span>
</span>
</div>
</div>
{% endifequal %}
{% endifequal %}
{% endfor %}
{% endfor %}
</div>
</div>
{% endif %}
{% endif %}
...
@@ -483,7 +483,8 @@
...
@@ -483,7 +483,8 @@
selected
=
selected
.
toLowerCase
()
selected
=
selected
.
toLowerCase
()
var
is_pubmed
=
(
selected
.
indexOf
(
'pubmed'
)
!=
-
1
);
var
is_pubmed
=
(
selected
.
indexOf
(
'pubmed'
)
!=
-
1
);
var
is_istex
=
(
selected
.
indexOf
(
'istex'
)
!=
-
1
);
var
is_istex
=
(
selected
.
indexOf
(
'istex'
)
!=
-
1
);
if
(
is_pubmed
||
is_istex
)
{
var
is_cern
=
(
selected
.
indexOf
(
'istex'
)
!=
-
1
);
if
(
is_pubmed
||
is_istex
||
is_cern
)
{
// if(selected=="pubmed") {
// if(selected=="pubmed") {
console
.
log
(
"show the button for: "
+
selected
)
console
.
log
(
"show the button for: "
+
selected
)
$
(
"#pubmedcrawl"
).
css
(
"visibility"
,
"visible"
);
$
(
"#pubmedcrawl"
).
css
(
"visibility"
,
"visible"
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment