Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
d0125723
Commit
d0125723
authored
May 27, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX] Merge ready for unstable.
parent
09535a2c
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
102 additions
and
122 deletions
+102
-122
constants.py
gargantext/constants.py
+43
-55
nodes.py
gargantext/models/nodes.py
+3
-7
http.py
gargantext/util/http.py
+0
-1
__init__.py
gargantext/util/taggers/__init__.py
+3
-3
projects.py
gargantext/views/pages/projects.py
+47
-50
terms.py
gargantext/views/pages/terms.py
+2
-2
istex.py
moissonneurs/istex.py
+2
-2
pubmed.py
moissonneurs/pubmed.py
+2
-2
No files found.
gargantext/constants.py
View file @
d0125723
...
...
@@ -112,18 +112,17 @@ INDEXED_HYPERDATA = {
}
#
from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
from
gargantext.util.taggers
import
NltkTagger
from
gargantext.util.taggers
import
FrenchMeltTagger
,
TurboTagger
LANGUAGES
=
{
'en'
:
{
#'tagger': EnglishMeltTagger,
#
'tagger': TurboTagger,
'tagger'
:
NltkTagger
,
'tagger'
:
TurboTagger
,
#
'tagger': NltkTagger,
},
'fr'
:
{
#
'tagger': FrenchMeltTagger,
'tagger'
:
FrenchMeltTagger
,
# 'tagger': TreeTagger,
'tagger'
:
NltkTagger
,
},
}
...
...
@@ -131,96 +130,85 @@ LANGUAGES = {
from
gargantext.util.parsers
import
\
EuropressParser
,
RISParser
,
PubmedParser
,
ISIParser
,
CSVParser
,
ISTexParser
,
CernParser
#from gargantext.util.scrappers import \
# CernScraper
def get_resource(corpus_type):
    '''Return the RESOURCETYPES entry whose "type" equals corpus_type.

    Entries are scanned in list order and the first match is returned;
    None is returned when no entry matches.
    '''
    matching = (entry for entry in RESOURCETYPES if entry["type"] == corpus_type)
    return next(matching, None)
def resourcetype(name):
    '''
    resourcetype :: String -> Int
    Return the position in RESOURCETYPES of the first entry named `name`.

    Usage : resourcetype("Europress (English)") == 1
    Examples in scrapers scripts (Pubmed or ISTex for instance).

    Raises IndexError when no entry of RESOURCETYPES carries that name.
    '''
    for index, resource in enumerate(RESOURCETYPES):
        if resource['name'] == name:
            # first match wins; no need to scan the rest of the list
            return index
    # same exception type as indexing an empty match list, but the message
    # tells which name was not found
    raise IndexError('unknown resource type name: %r' % (name,))
def resourcename(corpus):
    '''
    resourcename :: Corpus -> String
    Usage : resourcename(corpus) == "ISTex"

    Looks up the type of the corpus' first resource in RESOURCETYPES and
    returns that entry's name without its parenthesised suffix, e.g.
    "Pubmed (XML format)" -> "Pubmed".
    NOTE(review): corpus.resources()[0] fails when the corpus has no
    resource attached; callers are presumably expected to guarantee one.
    '''
    # only the first resource is considered to determine the corpus' source
    first_resource = corpus.resources()[0]
    # the stored 'type' is used as a position in the RESOURCETYPES list
    full_name = RESOURCETYPES[first_resource['type']]['name']
    # drop the "(...)" suffix together with the whitespace preceding it, so
    # that no trailing blank is left in the returned name
    return re.sub(r'\s*\(.*', '', full_name)
RESOURCETYPES
=
[
# type 0
{
'type'
:
0
,
'name'
:
'Select database below'
,
{
'name'
:
'Select database below'
,
'parser'
:
None
,
'default_language'
:
None
,
},
# type 1
{
'type'
:
1
,
'name'
:
'Europress (English)'
,
{
'name'
:
'Europress (English)'
,
'parser'
:
EuropressParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
# type 2
{
'type'
:
2
,
'name'
:
'Europress (French)'
,
{
'name'
:
'Europress (French)'
,
'parser'
:
EuropressParser
,
'default_language'
:
'fr'
,
'accepted_formats'
:[
"zip"
,],
},
# type 3
{
'type'
:
3
,
'name'
:
'Jstor (RIS format)'
,
{
'name'
:
'Jstor (RIS format)'
,
'parser'
:
RISParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
# type 4
{
'type'
:
4
,
'name'
:
'Pubmed (XML format)'
,
{
'name'
:
'Pubmed (XML format)'
,
'parser'
:
PubmedParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
# type 5
{
'type'
:
5
,
'name'
:
'Scopus (RIS format)'
,
{
'name'
:
'Scopus (RIS format)'
,
'parser'
:
RISParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
# type 6
{
'type'
:
6
,
'name'
:
'Web of Science (ISI format)'
,
{
'name'
:
'Web of Science (ISI format)'
,
'parser'
:
ISIParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
# type 7
{
'type'
:
7
,
'name'
:
'Zotero (RIS format)'
,
{
'name'
:
'Zotero (RIS format)'
,
'parser'
:
RISParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
# type 8
{
'type'
:
8
,
'name'
:
'CSV'
,
{
'name'
:
'CSV'
,
'parser'
:
CSVParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"csv"
],
},
# type 9
{
"type"
:
9
,
'name'
:
'ISTex'
,
{
'name'
:
'ISTex'
,
'parser'
:
ISTexParser
,
'default_language'
:
'en'
,
'accepted_formats'
:[
"zip"
,],
},
{
"type"
:
10
,
"name"
:
'SCOAP (XML MARC21 Format)'
,
"parser"
:
CernParser
,
"default_language"
:
"en"
,
'accepted_formats'
:[
"zip"
,
"xml"
]
,
#~ "scrapper": CernScrapper
,
#~ "base_url": "http://api.scoap3.org/search?"
,
}
,
]
# type 10
{
"type"
:
10
,
"name"
:
'SCOAP (XML MARC21 Format)'
,
"parser"
:
CernParser
,
"default_language"
:
"en"
,
'accepted_formats'
:[
"zip"
,
"xml"
]
,
#~ "scrapper": CernScrapper
,
#~ "base_url": "http://api.scoap3.org/search?"
,
},
]
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO
=
.75
# MAINLIST maximum terms in %
...
...
@@ -246,8 +234,8 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
# occurring at sentence beginning)
# ------------------------------------------------------------------------------
# other parameters
# other parameters
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT
=
1000
...
...
@@ -257,7 +245,7 @@ from .settings import BASE_DIR
# uploads/.gitignore prevents corpora indexing
# corpora can be either a folder or a symlink towards a specific partition
UPLOAD_DIRECTORY
=
os
.
path
.
join
(
BASE_DIR
,
'uploads/corpora'
)
UPLOAD_LIMIT
=
1024
*
1024
*
1024
UPLOAD_LIMIT
=
1024
*
1024
*
1024
DOWNLOAD_DIRECTORY
=
UPLOAD_DIRECTORY
...
...
gargantext/models/nodes.py
View file @
d0125723
...
...
@@ -110,7 +110,6 @@ class Node(Base):
if
order
is
not
None
:
query
=
query
.
order_by
(
Node
.
name
)
return
query
def
add_child
(
self
,
**
kwargs
):
...
...
@@ -136,7 +135,7 @@ class Node(Base):
self
[
'resources'
]
=
MutableList
()
return
self
[
'resources'
]
def
add_resource
(
self
,
type
,
path
=
None
,
url
=
None
,
**
kwargs
):
def
add_resource
(
self
,
type
,
path
=
None
,
url
=
None
):
"""Attach a resource to a given node.
Mainly used for corpora.
...
...
@@ -146,13 +145,10 @@ class Node(Base):
{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1,
'url': None,
'status':
'status_message':
}
'url': None}
"""
self
.
resources
()
.
append
(
MutableDict
(
{
'type'
:
type
,
'path'
:
path
,
'url'
:
url
,
'extracted'
:
False
,
**
kwargs
}
{
'type'
:
type
,
'path'
:
path
,
'url'
:
url
,
'extracted'
:
False
}
))
def
status
(
self
,
action
=
None
,
progress
=
0
,
complete
=
False
,
error
=
None
):
...
...
gargantext/util/http.py
View file @
d0125723
...
...
@@ -16,7 +16,6 @@ def requires_auth(func):
Also passes the URL to redirect towards as a GET parameter.
"""
def
_requires_auth
(
request
,
*
args
,
**
kwargs
):
#print(request.user.is_authenticated())
if
not
request
.
user
.
is_authenticated
():
url
=
'/auth/login/?next=
%
s'
%
urlencode
(
request
.
path
)
return
redirect
(
url
)
...
...
gargantext/util/taggers/__init__.py
View file @
d0125723
#
from .TurboTagger import TurboTagger
from
.TurboTagger
import
TurboTagger
from
.NltkTagger
import
NltkTagger
#
from .TreeTagger import TreeTagger
#
from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
from
.TreeTagger
import
TreeTagger
from
.MeltTagger
import
EnglishMeltTagger
,
FrenchMeltTagger
gargantext/views/pages/projects.py
View file @
d0125723
...
...
@@ -2,12 +2,11 @@ from gargantext.util.http import *
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
cache
from
gargantext.util.files
import
upload
from
gargantext.util.files
import
check_format
from
gargantext.models
import
*
from
gargantext.constants
import
*
from
gargantext.util.scheduling
import
scheduled
from
gargantext.util.toolchain
import
parse_extract_indexhyperdata
from
gargantext.util.toolchain
import
add_corpus
from
datetime
import
datetime
from
collections
import
defaultdict
...
...
@@ -18,7 +17,7 @@ import re
@
requires_auth
def
overview
(
request
):
'''This view show all projects for a given user.
Each project is described with hyperdata that are updated on each following view.
Each project is described with hyperdata that are updated
ed
on each following view.
To each project, we can link a resource that can be an image.
'''
...
...
@@ -60,25 +59,17 @@ def overview(request):
class
NewCorpusForm
(
forms
.
Form
):
'''OK: add corpus Form (NIY)'''
type
=
forms
.
ChoiceField
(
choices
=
enumerate
(
resource_type
[
'name'
]
for
resource_type
in
RESOURCETYPES
),
widget
=
forms
.
Select
(
attrs
=
{
'onchange'
:
'CustomForSelect( $("option:selected", this).text() );'
})
)
name
=
forms
.
CharField
(
label
=
'Name'
,
max_length
=
199
,
widget
=
forms
.
TextInput
(
attrs
=
{
'required'
:
'true'
}))
file
=
forms
.
FileField
()
def
clean_file
(
self
):
file_
=
self
.
cleaned_data
.
get
(
'file'
)
if
len
(
file_
)
>
UPLOAD_LIMIT
:
# we don't accept more than 1GB
if
len
(
file_
)
>
1024
**
3
:
# we don't accept more than 1GB
raise
forms
.
ValidationError
(
ugettext_lazy
(
'File too heavy! (>1GB).'
))
return
file_
def
check_filename
(
self
):
print
(
self
.
cleaned_data
)
print
(
self
.
cleaned_data
.
get
(
"file"
)
.
split
(
"."
)[
-
1
])
#if self.cleaned_data.get("file").split(".")[-1] not in RESSOURCETYPES[choices]
#print RESOURCETYPES[self.cleaned_data.get("
pass
@
requires_auth
...
...
@@ -92,55 +83,61 @@ def project(request, project_id):
if
not
user
.
owns
(
project
):
raise
HttpResponseForbidden
()
#
add a new corpus into Node Project > Node Corpus > Resource
#
new corpus
if
request
.
method
==
'POST'
:
corpus
=
add_corpus
(
request
,
project
)
if
corpus
.
status
:
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled
(
parse_extract_indexhyperdata
)(
corpus
.
id
)
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
context
=
{
corpus
=
project
.
add_child
(
name
=
request
.
POST
[
'name'
],
typename
=
'CORPUS'
,
)
corpus
.
add_resource
(
type
=
int
(
request
.
POST
[
'type'
]),
path
=
upload
(
request
.
FILES
[
'file'
]),
)
session
.
add
(
corpus
)
session
.
commit
()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled
(
parse_extract_indexhyperdata
)(
corpus
.
id
)
return
render
(
template_name
=
'pages/projects/wait.html'
,
request
=
request
,
context
=
{
'user'
:
request
.
user
,
'project'
:
project
,
},
)
},
)
#
list all the
corpora within this project
# corpora within this project
corpora
=
project
.
children
(
'CORPUS'
,
order
=
True
)
.
all
()
#print(corpora)
sourcename2corpora
=
defaultdict
(
list
)
for
corpus
in
corpora
:
# we only consider the first resource of the corpus to determine its type
resources
=
corpus
.
resources
()
if
len
(
resources
)
>
0
:
if
len
(
resources
):
resource
=
resources
[
0
]
resource
=
get_resource
(
resource
[
"type"
])
##here map from RESSOURCES_TYPES_ID and NOT NAME
resource_type_name
=
resource
[
'name'
]
resource_type_accepted_formats
=
resource
[
'accepted_formats'
]
# add some data for the viewer
corpus
.
count
=
corpus
.
children
(
'DOCUMENT'
)
.
count
()
status
=
corpus
.
status
()
if
status
is
not
None
and
not
status
[
'complete'
]:
if
not
status
[
'error'
]:
corpus
.
status_message
=
'(in progress:
%
s,
%
d complete)'
%
(
status
[
'action'
]
.
replace
(
'_'
,
' '
),
status
[
'progress'
],
)
else
:
corpus
.
status_message
=
'(aborted: "
%
s" after
%
i docs)'
%
(
status
[
'error'
][
-
1
],
status
[
'progress'
]
)
resource_type_name
=
RESOURCETYPES
[
resource
[
'type'
]][
'name'
]
else
:
print
(
"(WARNING) PROJECT view: no listed resource"
)
# add some data for the viewer
corpus
.
count
=
corpus
.
children
(
'DOCUMENT'
)
.
count
()
status
=
corpus
.
status
()
if
status
is
not
None
and
not
status
[
'complete'
]:
if
not
status
[
'error'
]:
corpus
.
status_message
=
'(in progress:
%
s,
%
d complete)'
%
(
status
[
'action'
]
.
replace
(
'_'
,
' '
),
status
[
'progress'
],
)
else
:
corpus
.
status_message
=
''
# add
sourcename2corpora
[
resource_type_name
]
.
append
(
corpus
)
corpus
.
status_message
=
'(aborted: "
%
s" after
%
i docs)'
%
(
status
[
'error'
][
-
1
],
status
[
'progress'
]
)
else
:
corpus
.
status_message
=
''
# add
sourcename2corpora
[
resource_type_name
]
.
append
(
corpus
)
# source & their respective counts
total_documentscount
=
0
sourcename2documentscount
=
defaultdict
(
int
)
...
...
gargantext/views/pages/terms.py
View file @
d0125723
...
...
@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from
gargantext.util.db
import
session
from
gargantext.util.db_cache
import
cache
from
gargantext.models
import
Node
from
gargantext.constants
import
get_resourc
e
from
gargantext.constants
import
resourcenam
e
from
datetime
import
datetime
@
requires_auth
...
...
@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
'date'
:
datetime
.
now
(),
'project'
:
project
,
'corpus'
:
corpus
,
'resourcename'
:
get_ressource
(
corpus
)[
"name"
]
,
'resourcename'
:
resourcename
(
corpus
)
,
'view'
:
'terms'
},
)
moissonneurs/istex.py
View file @
d0125723
...
...
@@ -8,7 +8,7 @@ from traceback import print_tb
from
django.shortcuts
import
redirect
,
render
from
django.http
import
Http404
,
HttpResponseRedirect
,
HttpResponseForbidden
from
gargantext.constants
import
QUERY_SIZE_N_MAX
from
gargantext.constants
import
resourcetype
,
QUERY_SIZE_N_MAX
from
gargantext.models.nodes
import
Node
from
gargantext.util.db
import
session
from
gargantext.util.http
import
JsonHttpResponse
...
...
@@ -133,7 +133,7 @@ def save(request , project_id):
if
filename
!=
False
:
# add the uploaded resource to the corpus
corpus
.
add_resource
(
type
=
9
type
=
resourcetype
(
'ISTex'
)
,
path
=
filename
)
dwnldsOK
+=
1
...
...
moissonneurs/pubmed.py
View file @
d0125723
...
...
@@ -18,7 +18,7 @@ from traceback import print_tb
from
django.shortcuts
import
redirect
from
django.http
import
Http404
,
HttpResponseRedirect
,
HttpResponseForbidden
from
gargantext.constants
import
get_resourc
e
,
QUERY_SIZE_N_MAX
from
gargantext.constants
import
resourcetyp
e
,
QUERY_SIZE_N_MAX
from
gargantext.models.nodes
import
Node
from
gargantext.util.db
import
session
from
gargantext.util.db_cache
import
cache
...
...
@@ -134,7 +134,7 @@ def save( request , project_id ) :
print
(
filename
)
if
filename
!=
False
:
# add the uploaded resource to the corpus
corpus
.
add_resource
(
type
=
4
corpus
.
add_resource
(
type
=
resourcetype
(
'Pubmed (XML format)'
)
,
path
=
filename
,
url
=
None
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment