humanities / gargantext

Commit d37bdbd3, authored May 12, 2015 by Administrator

    Merge branch 'testing' into prod-dev

Parents: a1b68438, 407b96ab

Showing 7 changed files with 97 additions and 63 deletions (+97 -63):
    parsing/FileParsers/EuropressFileParser.py   +36 -36
    parsing/FileParsers/FileParser.py            +14 -12
    parsing/FileParsers/RisFileParser.py         +12  -4
    parsing/FileParsers/ZoteroFileParser.py      +23  -0   (new file)
    parsing/FileParsers/__init__.py               +1  -0
    parsing/corpustools.py                         +9  -9
    parsing/parsers_config.py                      +2  -2
parsing/FileParsers/EuropressFileParser.py

@@ -11,7 +11,7 @@ from ..NgramsExtractors import *
from admin.utils import PrintException

class EuropressFileParser(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"

@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser):
        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)
            try:
                format_europresse = 50
                html_articles = html.xpath('/html/body/table/tbody')
                if len(html_articles) < 1:
                    html_articles = html.xpath('/html/body/table')
                    if len(html_articles) < 1:
                        format_europresse = 1
                        html_articles = html.xpath('//div[@id="docContain"]')
            except:
                PrintException()
            if format_europresse == 50:
                name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
                header_xpath = "./tr/td/span[@class = 'DocHeader']"

@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser):
                    or self::td[@class='txtCertificat'] \
                    )]/text()"
                doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
        except Exception as error:
            PrintException()

@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser):
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}
                if len(html_article):
                    for name in html_article.xpath(name_xpath):
                        if name.text is not None:

@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser):
                                hyperdata['volume'] = test_journal.group(2)
                            else:
                                hyperdata['journal'] = name.text.encode(codif)
                    countbis = 0
                    for header in html_article.xpath(header_xpath):
                        # print(count)
                        # countbis += 1
                        # try:
                        #     print('109', hyperdata['publication_date'])
                        # except:
                        #     print('no date yet')
                        #     pass
                        try:
                            text = header.text
                            #print("header", text)
                        except Exception as error:
                            print(error)
                        if isinstance(text, bytes):
                            text = text.decode(encoding)
                        format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)

@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser):
                        test_date_en = None
                        test_sect = None
                        test_page = None
                        if test_date_fr is not None:
                            self.localeEncoding = "fr_FR"
                            locale.setlocale(locale.LC_ALL, localeEncoding)

@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser):
                            except Exception as error:
                                print(error, text)
                                pass
                        if test_date_en is not None:
                            localeEncoding = "en_GB.UTF-8"

@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser):
                        if test_sect is not None:
                            hyperdata['section'] = test_sect.group(1).encode(codif)
                        if test_page is not None:
                            hyperdata['page'] = test_page.group(1).encode(codif)
-                   try:
-                       print('183', hyperdata['publication_date'])
-                   except:
-                       print('no date yet')
-                       pass
+                   # try:
+                   #     print('183', hyperdata['publication_date'])
+                   # except:
+                   #     print('no date yet')
+                   #     pass
+                   #
                    hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
                    hyperdata['abstract'] = html_article.xpath(text_xpath)
                    line = 0
                    br_tag = 10
                    for i in html_articles[count].iter():

@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser):
                        hyperdata['authors'] = 'not found'
                    line = 0
                    br_tag = 10
                    try:
                        if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '':
                            try:
                                back = hyperdata['publication_date']
                            except Exception as e:
                                #print(e)
                                pass
                        else:

@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser):
                        #hyperdata['language_iso2'] = 'fr'
                        #elif lang == 'en':
                        #    hyperdata['language_iso2'] = 'en'
                        hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                        hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                        hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')
                        #hyperdata.pop('publication_date')
                        if len(hyperdata['abstract']) > 0 and format_europresse == 50:
                            hyperdata['doi'] = str(hyperdata['abstract'][-9])
                            hyperdata['abstract'].pop()
                            # Here add separator for paragraphs

@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser):
                            # Here add separator for paragraphs
                            hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                        else:
                            hyperdata['doi'] = "not found"
                        hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
                        hyperdata['length_letters'] = len(hyperdata['abstract'])
                        hyperdata['bdd'] = u'europresse'
                        hyperdata['url'] = u''
                        #hyperdata_str = {}
                        for key, value in hyperdata.items():
                            hyperdata[key] = value.decode() if isinstance(value, bytes) else value
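The @@ -98 hunk above compiles format_date_fr to spot French-style dates in the DocHeader spans before the locale is switched to fr_FR. A minimal sketch of that detection, written as a raw-string regex and using a hypothetical header value (the sample string is illustrative, not taken from the commit):

import re

# same pattern as format_date_fr in the hunk above
format_date_fr = re.compile(r'\d*\s*\w+\s+\d{4}', re.UNICODE)

header_text = "12 mai 2015"              # hypothetical DocHeader content
match = format_date_fr.search(header_text)
if match is not None:
    print(match.group(0))                # "12 mai 2015", later parsed under the fr_FR locale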
parsing/FileParsers/FileParser.py

@@ -4,21 +4,21 @@ import zipfile
import chardet

from ..Caches import LanguagesCache

class FileParser:
    """Base class for performing files parsing depending on their type.
    """
    def __init__(self, language_cache=None):
        self._languages_cache = LanguagesCache() if language_cache is None else language_cache

    def detect_encoding(self, string):
        """Useful method to detect the document encoding.
        """
        encoding = chardet.detect(string)
        return encoding.get('encoding', 'UTF-8')

    def format_hyperdata_dates(self, hyperdata):
        """Format the dates found in the hyperdata.
        Examples:

@@ -27,7 +27,7 @@ class FileParser:
            {"publication_year": "2014"}
                -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
        """
        # First, check the split dates...
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
        for prefix in prefixes:

@@ -51,21 +51,23 @@ class FileParser:
                hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
            except:
                pass
        # ...then parse all the "date" fields, to parse it into separate elements
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
        for prefix in prefixes:
            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
+           print('date')
            hyperdata[prefix + "_year"]   = date.strftime("%Y")
            hyperdata[prefix + "_month"]  = date.strftime("%m")
            hyperdata[prefix + "_day"]    = date.strftime("%d")
            hyperdata[prefix + "_hour"]   = date.strftime("%H")
            hyperdata[prefix + "_minute"] = date.strftime("%M")
            hyperdata[prefix + "_second"] = date.strftime("%S")
        # finally, return the transformed result!
        return hyperdata

    def format_hyperdata_languages(self, hyperdata):
        """format the languages found in the hyperdata."""
        language = None

@@ -81,18 +83,18 @@ class FileParser:
            hyperdata["language_iso3"] = language.iso3
            hyperdata["language_fullname"] = language.fullname
        return hyperdata

    def format_hyperdata(self, hyperdata):
        """Format the hyperdata."""
        hyperdata = self.format_hyperdata_dates(hyperdata)
        hyperdata = self.format_hyperdata_languages(hyperdata)
        return hyperdata

    def _parse(self, file):
        """This method shall be overriden by inherited classes."""
        return list()

    def parse(self, file):
        """Parse the file, and its children files found in the file.
        """
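The docstring and the two prefixes passes above describe the round trip performed by format_hyperdata_dates: split "*_year"/"*_month"/"*_day" keys are first assembled into a full "*_date", and every "*_date" is then expanded back into its components. A hedged, self-contained sketch of that behaviour follows; the assembly of date_string is an assumption, since that part of the method is collapsed in this diff:

import dateutil.parser

def format_dates_sketch(hyperdata):
    # pass 1: any "<prefix>_year" key is turned into a full "<prefix>_date" string (assumed assembly)
    for prefix in [k[:-5] for k in list(hyperdata) if k.endswith("_year")]:
        date_string = "%s-%s-%s" % (hyperdata[prefix + "_year"],
                                    hyperdata.get(prefix + "_month", "01"),
                                    hyperdata.get(prefix + "_day", "01"))
        hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
    # pass 2: every "<prefix>_date" is split back into year/month/day/hour/minute/second
    for prefix in [k[:-5] for k in list(hyperdata) if k.endswith("_date")]:
        date = dateutil.parser.parse(hyperdata[prefix + "_date"])
        for suffix, fmt in (("_year", "%Y"), ("_month", "%m"), ("_day", "%d"),
                            ("_hour", "%H"), ("_minute", "%M"), ("_second", "%S")):
            hyperdata[prefix + suffix] = date.strftime(fmt)
    return hyperdata

print(format_dates_sketch({"publication_year": "2014"}))
# {"publication_year": "2014", "publication_date": "2014-01-01 00:00:00", "publication_month": "01", ...}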
parsing/FileParsers/RisFileParser.py

@@ -3,15 +3,17 @@ from .FileParser import FileParser
from ..Caches import LanguagesCache
+from admin.utils import PrintException

class RisFileParser(FileParser):
    def __init__(self, language_cache=None):
        super(FileParser, self).__init__()
        self._languages_cache = LanguagesCache() if language_cache is None else language_cache
        self._begin = 6
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},

@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }

    def _parse(self, file):
        hyperdata = {}

@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
                print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
+           # try:
+           #     if hyperdata['date_to_parse']:
+           #         print(hyperdata['date_to_parse'])
+           # except:
+           #     pass
+           #
            #print(hyperdata['title'])
            yield hyperdata
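The _parameters table above maps two-letter RIS tags to hyperdata keys, _begin = 6 marks the column where field content starts, and _parse yields one hyperdata dict per record, finishing with any record still in memory. The body of _parse is mostly collapsed in this diff, so the loop below is only an illustrative sketch of how such a tag table can drive the parsing, not the committed implementation:

parameters = {
    b"ER": {"type": "delimiter"},
    b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
    b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
}
begin = 6  # plays the role of self._begin: field content starts at column 6

def parse_ris_lines(lines):
    hyperdata = {}
    for line in lines:
        tag = line[:2]
        if tag not in parameters:
            continue
        if parameters[tag]["type"] == "delimiter":        # b"ER" closes a record
            yield hyperdata
            hyperdata = {}
        else:
            key = parameters[tag]["key"]
            separator = parameters[tag].get("separator", "")
            value = line[begin:].rstrip(b"\r\n").decode("utf-8")
            hyperdata[key] = hyperdata[key] + separator + value if key in hyperdata else value
    if hyperdata:  # mirror the "if a hyperdata object is left in memory" branch above
        yield hyperdata

record = [b"TI  - A title\n", b"AB  - Some abstract\n", b"ER  - \n"]
print(list(parse_ris_lines(record)))   # [{'title': 'A title', 'abstract': 'Some abstract'}]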
parsing/FileParsers/ZoteroFileParser.py (new file, 0 → 100644)

from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache

class ZoteroFileParser(RisFileParser):
    def __init__(self):
        super(RisFileParser, self).__init__()
        self._begin = 6
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
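The new parser only overrides the RIS tag table, adding Zotero-specific tags (AU, UR, DA, PY, PD, LA) on top of what RisFileParser already handles. A hedged usage sketch, with a placeholder file name and under the assumption that parse() yields one hyperdata dict per record, as _parse() does in RisFileParser (the parse() wrapper itself is not shown in this diff):

from parsing.FileParsers import ZoteroFileParser   # exported by the new __init__.py line below

parser = ZoteroFileParser()
with open("zotero_export.ris", "rb") as ris_file:  # hypothetical Zotero RIS export
    for hyperdata in parser.parse(ris_file):       # assumed to yield one dict per record
        print(hyperdata.get("title"), hyperdata.get("publication_year"))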
parsing/FileParsers/__init__.py

from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
+from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
parsing/corpustools.py

@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None):
        nodes.append(node)
    #
    # TODO: mark node-resources associations as parsed
    #
    dbg.show('insert %d documents' % len(nodes))
    session.add_all(nodes)
    session.commit()

@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys):
        language.id: language.iso2
        for language in session.query(Language)
    }
    ngrams_data = set()
    ngrams_language_data = set()
    ngrams_tag_data = set()

@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys):
            #tag_id = 14
            #print('tag_id_2', tag_id)
            node_ngram_list[node_id][terms] += 1
-           ngrams_data.add((n, terms))
+           ngrams_data.add((n, terms[:255]))
            ngrams_language_data.add((terms, language_id))
            ngrams_tag_data.add((terms, tag_id))

@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys):
            ngram.terms = tmp__ngrams.terms
    ''' % (Ngram.__table__.name, ))
    # insert, then get the ids back
    cursor.execute('''
        INSERT INTO
            %s (n, terms)

@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys):
        WHERE
            id IS NULL
    ''' % (Ngram.__table__.name, ))
    cursor.execute('''
        UPDATE
            tmp__ngrams

@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys):
        AND
            tmp__ngrams.id IS NULL
    ''' % (Ngram.__table__.name, ))
    # get all ids
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngrams')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]
    #
    dbg.show('insert associations')
    node_ngram_data = list()
    for node_id, ngrams in node_ngram_list.items():
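The only visible change in the @@ -241 hunk truncates each n-gram to 255 characters before it is queued for insertion, presumably (an assumption, since the Ngram schema is not part of this diff) to keep very long terms within a 255-character terms column. A minimal illustration of the new behaviour:

terms = "a very long automatically extracted n-gram " * 10   # well over 255 characters
n = len(terms.split())

ngrams_data = set()
ngrams_data.add((n, terms[:255]))   # new behaviour: at most 255 characters are stored
assert all(len(t) <= 255 for _, t in ngrams_data)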
parsing/parsers_config.py

@@ -4,11 +4,11 @@ parsers = {
    'Pubmed (xml format)'           : PubmedFileParser,
    'Web of Science (ISI format)'   : IsiFileParser,
    'Scopus (RIS format)'           : RisFileParser,
-   'Zotero (RIS format)'           : JstorFileParser,
+   'Zotero (RIS format)'           : ZoteroFileParser,
    'Jstor (RIS format)'            : JstorFileParser,
    #'Europress' : EuropressFileParser,
    'Europress (French)'            : EuropressFileParser,
    'Europress (English)'           : EuropressFileParser,
}
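This change points the 'Zotero (RIS format)' entry at the new ZoteroFileParser rather than JstorFileParser. A hedged sketch of how such a mapping is typically used to pick a parser from a resource-type name (the dispatch code itself is not part of this commit):

from parsing.parsers_config import parsers

resource_type = 'Zotero (RIS format)'
parser_class = parsers[resource_type]   # now ZoteroFileParser rather than JstorFileParser
parser = parser_class()                 # ready to parse the corresponding resource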