humanities / gargantext
Commit 49552ff6, authored Jul 28, 2016 by c24b
Parent: cd453144

PARSING with default_languages and skipped_docs added

Showing 7 changed files with 132 additions and 125 deletions.
gargantext/util/parsers/CERN.py                    +23  -18
gargantext/util/taggers/_Tagger.py                  +5   -5
gargantext/util/toolchain/main.py                   +2   -1
gargantext/util/toolchain/ngram_groups.py          +17  -21
gargantext/util/toolchain/ngrams_extraction.py     +40  -41
gargantext/util/toolchain/parsing.py               +42  -33
gargantext/views/pages/corpora.py                   +3   -6
gargantext/util/parsers/CERN.py

@@ -40,26 +40,31 @@ class CernParser(Parser):
         "856": {"u": "pdf_source"},
         }

-    def format_date(self, hyperdata):
-        '''formatting pubdate'''
-        prefix = "publication"
-        date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
-        #hyperdata[prefix + "_year"] = date.strftime('%Y')
-        hyperdata[prefix + "_month"] = date.strftime("%m")
-        hyperdata[prefix + "_day"] = date.strftime("%d")
-        hyperdata[prefix + "_hour"] = date.strftime("%H")
-        hyperdata[prefix + "_minute"] = date.strftime("%M")
-        hyperdata[prefix + "_second"] = date.strftime("%S")
-        hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
-        print("Date", hyperdata["publication_date"])
-        return hyperdata
+    # def format_date(self, hyperdata):
+    #     '''formatting pubdate'''
+    #     prefix = "publication"
+    #     try:
+    #         date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
+    #     except ValueError:
+    #         date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m")
+    #         date.day = "01"
+    #     hyperdata[prefix + "_year"] = date.strftime('%Y')
+    #     hyperdata[prefix + "_month"] = date.strftime("%m")
+    #     hyperdata[prefix + "_day"] = date.strftime("%d")
+    #     hyperdata[prefix + "_hour"] = date.strftime("%H")
+    #     hyperdata[prefix + "_minute"] = date.strftime("%M")
+    #     hyperdata[prefix + "_second"] = date.strftime("%S")
+    #     hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
+    #     #print("Date", hyperdata["publication_date"])
+    #     return hyperdata

     #@asyncio.coroutine
     def parse(self, file):
-        print("PARSING")
+        #print("PARSING")
         hyperdata_list = []
         doc = file.read()
-        print(doc[:35])
+        #print(doc[:35])
         soup = BeautifulSoup(doc, "lxml")
         #print(soup.find("record"))
...
@@ -93,8 +98,8 @@ class CernParser(Parser):
         hyperdata["authors_affiliations"] = (",").join(hyperdata["authors_affiliations"])
         hyperdata["authors"] = (",").join(hyperdata["authors"])
         hyperdata["authors_mails"] = (",").join(hyperdata["authors_mails"])
-        hyperdata = self.format_date(hyperdata)
+        #hyperdata = self.format_date(hyperdata)
+        hyperdata = self.format_hyperdata_languages(hyperdata)
+        hyperdata = self.format_hyperdata_dates(hyperdata)
         hyperdata_list.append(hyperdata)
         return hyperdata_list
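
Side note: the commented-out format_date() above only splits one parsed datetime into per-component hyperdata fields; the parser now calls self.format_hyperdata_dates(), presumably inherited from the Parser base class, instead. For reference, a standalone sketch of the removed logic with plain datetime (the helper name split_publication_date is illustrative, not part of the commit):

from datetime import datetime

def split_publication_date(hyperdata, prefix="publication"):
    # parse the ISO date string and derive one hyperdata field per component
    date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
    for suffix, fmt in (("_month", "%m"), ("_day", "%d"), ("_hour", "%H"),
                        ("_minute", "%M"), ("_second", "%S")):
        hyperdata[prefix + suffix] = date.strftime(fmt)
    # store the date back as a full timestamp
    hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
    return hyperdata

print(split_publication_date({"publication_date": "2016-07-28"}))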
gargantext/util/taggers/_Tagger.py

@@ -9,7 +9,7 @@ import nltk
 class Tagger:

-    def __init__(self, text):
+    def __init__(self):
         # This regular expression is really good at tokenizing a text!
         self._re_sentence = re.compile(r'''(?x)      # set flag to allow verbose regexps
             (?:[A-Z])(?:\.[A-Z])+\.?        # abbreviations, e.g. U.S.A.
...
@@ -19,18 +19,18 @@ class Tagger:
             | [][.,;"'?!():-_`]             # these are separate tokens
             ''', re.UNICODE | re.MULTILINE | re.DOTALL)
         self.buffer = []
-        self.text = clean_text(text)
-        self.start()
+        #self.start()

-    def clean_text(text):
+    def clean_text(self, text):
         """Clean the text for better POS tagging.
         For now, only removes (short) XML tags.
         """
         return re.sub(r'<[^>]{0,45}>', '', text)

     def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
-        text = self.clean_text(text)
+        self.text = self.clean_text(text)
         grammar = nltk.RegexpParser(label + ': ' + rule)
         tagged_tokens = list(self.tag_text(self.text))
         if len(tagged_tokens):
...
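
Side note: after this refactoring a Tagger is created with no arguments and each call to extract() cleans and stores the text itself. A standalone sketch of the same calling convention, using plain NLTK in place of the project's concrete taggers; MiniTagger and its chunk rule (a rough stand-in for RULE_JJNN) are illustrative only:

import re
import nltk

class MiniTagger:
    def __init__(self):                      # no text argument any more
        self.buffer = []

    def clean_text(self, text):              # instance method, as in the new _Tagger
        """Remove short XML tags before tagging."""
        return re.sub(r'<[^>]{0,45}>', '', text)

    def extract(self, text, rule='{<JJ>*<NN.*>+}', label='NP', max_n_words=6):
        self.text = self.clean_text(text)    # the text is bound here, not in __init__
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged = nltk.pos_tag(nltk.word_tokenize(self.text))
        for subtree in grammar.parse(tagged).subtrees():
            if subtree.label() == label and len(subtree) <= max_n_words:
                yield subtree.leaves()       # each ngram is a list of (token, tag) pairs

tagger = MiniTagger()
for ngram in tagger.extract("Social <b>network</b> analysis of large corpora."):
    print([token for token, tag in ngram])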
gargantext/util/toolchain/main.py

@@ -82,6 +82,7 @@ def parse_extract_indexhyperdata(corpus):
     favs = corpus.add_child(
         typename='FAVORITES', name='favorite docs in "%s"' % corpus.name
     )
     session.add(favs)
     session.commit()
     print('CORPUS #%d: [%s] new favorites node #%i' % (corpus.id, t(), favs.id))
...
gargantext/util/toolchain/ngram_groups.py

@@ -22,17 +22,13 @@ def prepare_stemmers(corpus):
     """
     Returns *several* stemmers (one for each language in the corpus)
        (as a dict of stemmers with key = language_iso2)
+    languages has been previously filtered by supported source languages
+    and formatted
     """
-    stemmers_by_lg = {
-        # always get a generic stemmer in case language code unknown
-        '__unknown__' : SnowballStemmer("english")
-    }
-    for lang in corpus.languages.keys():
-        print(lang)
-        if (lang != '__skipped__'):
-            lgname = languages[lang].name.lower()
-            stemmers_by_lg[lang] = SnowballStemmer(lgname)
-    return stemmers_by_lg
+    stemmers = {lang: SnowballStemmer(languages[lang].name.lower()) for lang \
+                    in corpus.languages.keys() if lang != "__skipped__"}
+    stemmers['__unknown__'] = SnowballStemmer("english")
+    return stemmers

 def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     """
...
@@ -40,7 +36,6 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
     3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
     """
-    print(corpus.languages.keys())
     stop_ngrams_ids = {}
     # we will need the ngrams of the stoplist to filter
...
@@ -60,7 +55,8 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     my_groups = defaultdict(Counter)

     # preloop per doc to sort ngrams by language
-    for doc in corpus.children():
+    for doc in corpus.children('DOCUMENT'):
+        if doc.id not in corpus.skipped_docs:
             if ('language_iso2' in doc.hyperdata):
                 lgid = doc.hyperdata['language_iso2']
             else:
...
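
Side note: the rewritten prepare_stemmers() boils down to one SnowballStemmer per observed corpus language plus an English fallback under '__unknown__'. A minimal sketch of that pattern with plain NLTK; the iso2-to-name mapping below is a hypothetical stand-in for the project's languages table:

from nltk.stem.snowball import SnowballStemmer

corpus_languages = {"en": 12, "fr": 3, "__skipped__": 2}   # hypothetical corpus census
iso2_name = {"en": "english", "fr": "french"}              # stand-in for languages[lang].name

stemmers = {lang: SnowballStemmer(iso2_name[lang])
            for lang in corpus_languages.keys()
            if lang != "__skipped__"}
# always keep a generic stemmer in case the language code is unknown
stemmers['__unknown__'] = SnowballStemmer("english")

print(stemmers["fr"].stem("réseaux"))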
gargantext/util/toolchain/ngrams_extraction.py

@@ -3,7 +3,6 @@ from gargantext.models import *
 from gargantext.constants import *
 from collections import defaultdict
 from re import sub
 from gargantext.util.scheduling import scheduled

 def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
...
@@ -44,33 +43,26 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     db, cursor = get_cursor()
     nodes_ngrams_count = defaultdict(int)
     ngrams_data = set()
-    # extract ngrams
+    # 1 corpus = 1 resource
     resource = corpus.resources()[0]
-    source = get_resource(resource["type"])
     documents_count = 0
+    source = get_resource(resource["type"])
     #load available taggers for source default langage
+    #skipped documents that have been skipped previously for parsing error or unsupported language
-    tagger_bots = {lang: load_tagger(lang) for lang in corpus.languages if lang != "__skipped__"}
     docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
+    tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
     #sort docs by lang?
+    for lang, tagger in tagger_bots.items():
         for documents_count, document in enumerate(docs):
-            try:
-                lang_doc = document.hyperdata["language_iso2"]
-            except AttributeError:
-                print("NO LANG DETECTED")
-                document.status("NGRAMS", error="No lang detected?")
-                corpus.skipped_docs.append(document.id)
-                continue
+            language_iso2 = document.hyperdata.get('language_iso2', lang)
+            #print(language_iso2)
             for key in keys:
-                value = document.get(key, None)
-                print("VAL", value)
+                try:
+                    value = document[str(key)]
                     if not isinstance(value, str):
                         continue
                     # get ngrams
-                    for ngram in tagger_bots[lang_doc](value):
+                    for ngram in tagger.extract(value):
                         tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                        print("tk", tokens)
                         if do_subngrams:
                             # ex tokens = ["very", "cool", "exemple"]
                             # subterms = [['very', 'cool'],
...
@@ -88,7 +80,13 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                         nodes_ngrams_count[(document.id, ngram)] += 1
                         # add fields : terms n
                         ngrams_data.add((ngram[:255], len(seqterm), ))
+                except:
+                    #value not in doc
+                    pass
+            # except AttributeError:
+            #     print("ERROR NO language_iso2")
+            #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
+            #     corpus.skipped_docs.append(document.id)
         # integrate ngrams and nodes-ngrams
         if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
             _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
...
@@ -97,6 +95,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
         if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
             corpus.status('Ngrams', progress=documents_count+1)
             corpus.save_hyperdata()
+            session.add(corpus)
             session.commit()
         else:
             # integrate ngrams and nodes-ngrams
...
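
Side note: the reworked extraction loop instantiates one tagger per corpus language (load_tagger(lang)()), iterates over the documents that were not skipped during parsing, and counts (document, ngram) pairs. A simplified, self-contained sketch of that control flow, with hypothetical documents and a dummy tagger standing in for the real Node objects and load_tagger():

from collections import defaultdict

# dummy documents: (doc_id, hyperdata) pairs standing in for corpus.children('DOCUMENT')
docs = [(1, {"language_iso2": "en", "abstract": "Graph based text mining."}),
        (2, {"abstract": "A document with no language metadata."})]
skipped_docs = {3}                      # ids skipped earlier, e.g. parsing errors

def load_dummy_tagger(lang):
    # stands in for load_tagger(lang): returns a class whose extract()
    # yields ngrams as sequences of (token, tag) pairs
    class DummyTagger:
        def extract(self, text):
            return [[(tok, "NN")] for tok in text.split()]
    return DummyTagger

corpus_languages = {"en": 2, "__skipped__": 0}
tagger_bots = {lang: load_dummy_tagger(lang)() for lang in corpus_languages
               if lang != "__skipped__"}

nodes_ngrams_count = defaultdict(int)
for lang, tagger in tagger_bots.items():
    for doc_id, hyperdata in docs:
        if doc_id in skipped_docs:
            continue
        # fall back to the tagger's own language when the doc carries none
        language_iso2 = hyperdata.get("language_iso2", lang)
        for ngram in tagger.extract(hyperdata.get("abstract", "")):
            terms = " ".join(tok for tok, tag in ngram)
            nodes_ngrams_count[(doc_id, terms)] += 1

print(sorted(nodes_ngrams_count.items()))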
gargantext/util/toolchain/parsing.py

 from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
-from gargantext.util.parsers import *
+#from gargantext.util.parsers import *
 from collections import defaultdict, Counter
 from re import sub
...
@@ -11,13 +11,11 @@ def parse(corpus):
     corpus.status('Docs', progress=0)
     #1 corpus => 1 resource
     resources = corpus.resources()
-    #get the sources capabilities for a given corpus
-    #print(resource)
+    #get the sources capabilities for a given corpus resource
     sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
-    print(sources)
     if len(sources) == 0:
         #>>> documents have already been parsed?????
-        pass
+        return
     if len(sources) > 0:
         #>>> necessairement 1 corpus = 1 source dans l'archi actuelle
         source = sources[0]
...
@@ -27,25 +25,24 @@ def parse(corpus):
             #corpus.status(error)
             raise ValueError("Resource '%s' has no Parser" % resource["name"])
         else:
+            #observed langages in corpus docs
             corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
+            skipped_languages = []
+            #remember the skipped docs in parsing
             corpus.skipped_docs = []
             session.add(corpus)
             session.commit()
             #load the corresponding parser
             parserbot = load_parser(source)
-            skipped_languages = []
             # extract and insert documents from resource.path into database
-            #print(resource["path"])
             for hyperdata in parserbot(resource["path"]):
-                # indexed text fields defined in constants
+                # indexed text fields defined in CONSTANTS
                 for k in DEFAULT_INDEX_FIELDS:
                     if k in hyperdata.keys():
                         try:
                             hyperdata[k] = normalize_chars(hyperdata[k])
                         except Exception as error:
                             hyperdata["error"] = "Error normalize_chars"
                # a simple census to raise language info at corpus level
                if "language_iso2" in hyperdata.keys():
                    try:
...
@@ -54,17 +51,26 @@ def parse(corpus):
                        print("KeyError", hyperdata["language_iso2"])
                        hyperdata["error"] = "Error: unsupported language"
                        skipped_languages.append(hyperdata["language_iso2"])
-               elif "language_iso3" in hyperdata.keys():
+               elif "language_fullname" in hyperdata.keys():
                    try:
-                       lang = language[hyperdata["language_iso3"]]
+                       #full => iso2
+                       lang = languages[hyperdata["language_fullname"]].name.lower()
                        corpus.languages[lang] += 1
                    except KeyError:
-                       print("KeyError", lang)
+                       print("KeyError", hyperdata["language_fullname"])
                        hyperdata["error"] = "Error: unsupported language"
-                       skipped_languages.append(hyperdata["language_iso2"])
+                       skipped_languages.append(lang)
                else:
-                   raise ValueError("PARSING ERROR: No lang detected")
+                   pass
+                   #no language have been indexed
+                   #detectlang by index_fields
+                   # for k in DEFAULT_INDEX_FIELDS:
+                   #     if k in hyperdata.keys():
+                   #         try:
+                   #             hyperdata["language_iso2"] = langdetect(hyperdata[k])
+                   #         except Exception as error :
+                   #             pass
+                   #print(hyperdata.keys())
                # save as DB child
                # ----------------
                document = corpus.add_child(
...
@@ -76,11 +82,12 @@ def parse(corpus):
                if "error" in hyperdata.keys():
                    #document.status("error")
-                   print(hyperdata["error"])
                    document.status('Parsing', error=document.hyperdata["error"])
                    document.save_hyperdata()
                    session.commit()
+                   #adding skipped_docs for later processsing
                    corpus.skipped_docs.append(document.id)
+               documents_count += 1
                # logging
                if documents_count % BATCH_PARSING_SIZE == 0:
...
@@ -92,11 +99,13 @@ def parse(corpus):
-           documents_count += 1
            # update info about the resource
            resource['extracted'] = True
            # add a corpus-level info about languages adding a __skipped__ info
            corpus.languages['__skipped__'] = Counter(skipped_languages)
+           for n in corpus.languages.items():
+               print(n)
            # commit all changes
            corpus.status('Docs', progress=documents_count, complete=True)
            corpus.save_hyperdata()
...
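
Side note: the net effect of the parse() changes is corpus-level bookkeeping: corpus.languages starts from the source's default_languages, each supported document increments its counter, unsupported languages end up under a '__skipped__' Counter, and documents that error out are collected in corpus.skipped_docs. A minimal sketch of that bookkeeping, detached from the database layer (the sample hyperdata dicts are hypothetical):

from collections import Counter, defaultdict

default_languages = ["en", "fr"]                        # e.g. source["default_languages"]
languages = defaultdict.fromkeys(default_languages, 0)  # observed languages in corpus docs
skipped_languages = []
skipped_docs = []                                       # remember the skipped docs

parsed = [                                              # hypothetical parser output
    {"id": 1, "language_iso2": "en", "title": "A"},
    {"id": 2, "language_iso2": "it", "title": "B"},     # not in default_languages
    {"id": 3, "title": "C"},                            # no language detected
]

for hyperdata in parsed:
    lang = hyperdata.get("language_iso2")
    if lang in languages:
        languages[lang] += 1                            # census of supported languages
    elif lang is not None:
        hyperdata["error"] = "Error: unsupported language"
        skipped_languages.append(lang)
    if "error" in hyperdata:
        skipped_docs.append(hyperdata["id"])            # kept for later processing

languages['__skipped__'] = Counter(skipped_languages)
print(dict(languages), skipped_docs)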
gargantext/views/pages/corpora.py

...
@@ -37,7 +37,7 @@ def docs_by_titles(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus': corpus,
-            'resourcename': resourcename(corpus),
+            'resourcename': get_resource_by_name(corpus.resources()[0]),
             'view': 'titles',
             'user': request.user
         },
...
@@ -65,7 +65,7 @@ def docs_by_journals(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus': corpus,
-            'resourcename': resourcename(corpus),
+            'resourcename': get_resource_by_name(corpus.resources()[0]),
             'view': 'journals'
         },
     )
...
@@ -84,11 +84,8 @@ def analytics(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus': corpus,
-            'resourcename': resourcename(corpus),
+            'resourcename': get_resource_by_name(corpus.resources()[0]),
             'view': 'analytics',
             'user': request.user
         },
     )