humanities / gargantext · Commits

Commit 5160d178
Authored May 19, 2015 by PkSM3

    [UPDATE] merge conflict

Parents: 0853ddca, 2644e642

Showing 11 changed files with 614 additions and 101 deletions (+614, -101)
drop_db.sh               init/sql/drop_db.sh                           +4    -0
drop_db.sql              init/sql/drop_db.sql                          +0    -0
rename_metadata.sql      init/sql/rename_metadata.sql                  +8    -0
models.py                node/models.py                                +18   -19
EuropressFileParser.py   parsing/FileParsers/EuropressFileParser.py    +65   -49
FileParser.py            parsing/FileParsers/FileParser.py             +52   -27
RisFileParser.py         parsing/FileParsers/RisFileParser.py          +12   -4
ZoteroFileParser.py      parsing/FileParsers/ZoteroFileParser.py       +23   -0
__init__.py              parsing/FileParsers/__init__.py               +1    -0
corpustools.py           parsing/corpustools.py                        +430  -0
parsers_config.py        parsing/parsers_config.py                     +1    -2
init/sql/drop_db.sh (new file, mode 100755)

    export PGPASSWORD=C8kdcUrAQy66U
    psql -U gargantua -d gargandb -f drop_db.sql

init/sql/init.sql → init/sql/drop_db.sql

    File moved
init/sql/rename_metadata.sql (new file, mode 100644)

    ALTER TABLE node_node RENAME metadata TO hyperdata;
    ALTER TABLE node_metadata RENAME TO node_hyperdata;
    ALTER TABLE node_node_metadata RENAME TO node_node_hyperdata;
    ALTER TABLE node_node_hyperdata RENAME metadata_id TO hyperdata_id;
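This migration is meant to run against the same database that drop_db.sh targets. The commit itself only ships the SQL; purely as an illustration, here is a Python sketch that would apply it with psycopg2 (psycopg2 is not part of this commit, and the connection settings are simply copied from drop_db.sh):

    # Illustration only: the commit applies SQL through psql (see drop_db.sh above);
    # psycopg2 is assumed here just to keep the example in Python.
    import psycopg2

    conn = psycopg2.connect(dbname='gargandb', user='gargantua', password='C8kdcUrAQy66U')
    with conn, conn.cursor() as cur:
        cur.execute(open('init/sql/rename_metadata.sql').read())  # runs the four ALTER TABLE statements
    conn.close()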
node/models.py

@@ -306,7 +306,6 @@ class Node(CTENode):
         self.hyperdata['Processing'] = 0
         self.save()

 class Node_Hyperdata(models.Model):
     node      = models.ForeignKey(Node, on_delete=models.CASCADE)
     hyperdata = models.ForeignKey(Hyperdata)
parsing/FileParsers/EuropressFileParser.py

@@ -8,7 +8,7 @@ import dateutil.parser
 from .FileParser import FileParser
 from ..NgramsExtractors import *
+from admin.utils import PrintException

 class EuropressFileParser(FileParser):

@@ -29,8 +29,8 @@ class EuropressFileParser(FileParser):
         if encoding != "utf-8":
             try:
                 contents = contents.decode("latin1", errors='replace').encode(codif)
-            except Exception as error:
-                print(error)
+            except:
+                PrintException()
         # try:
         #     contents = contents.decode(encoding, errors='replace').encode(codif)
         # except Exception as error:

@@ -40,7 +40,7 @@ class EuropressFileParser(FileParser):
         html_parser = etree.HTMLParser(encoding=codif)
         html = etree.fromstring(contents, html_parser)
         try:
             format_europresse = 50
             html_articles = html.xpath('/html/body/table/tbody')

@@ -51,15 +51,15 @@ class EuropressFileParser(FileParser):
             if len(html_articles) < 1:
                 format_europresse = 1
                 html_articles = html.xpath('//div[@id="docContain"]')
-        except Exception as error:
-            print(error)
+        except:
+            PrintException()
         if format_europresse == 50:
             name_xpath   = "./tr/td/span[@class = 'DocPublicationName']"
-            header_xpath = "//span[@class = 'DocHeader']"
+            header_xpath = "./tr/td/span[@class = 'DocHeader']"
             title_xpath  = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
             text_xpath   = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
         elif format_europresse == 1:
             name_xpath   = "//span[@class = 'DocPublicationName']"
             header_xpath = "//span[@class = 'DocHeader']"
             title_xpath  = "string(//div[@class = 'titreArticleVisu'])"

@@ -79,8 +79,8 @@ class EuropressFileParser(FileParser):
             doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
         except Exception as error:
-            print(error)
+            PrintException()

         # parse all the articles, one by one
         try:

@@ -99,7 +99,18 @@ class EuropressFileParser(FileParser):
             else:
                 hyperdata['journal'] = name.text.encode(codif)

+            countbis = 0
             for header in html_article.xpath(header_xpath):
+                # print(count)
+                # countbis += 1
+                # try:
+                #     print('109', hyperdata['publication_date'])
+                # except:
+                #     print('no date yet')
+                #     pass
                 try:
                     text = header.text
                     #print("header", text)

@@ -145,12 +156,10 @@ class EuropressFileParser(FileParser):
                     hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                     # hyperdata['publication_date'] = dateutil.parser.parse(text)
                 except Exception as error:
-                    print(error)
-                    print(text)
+                    print(error, text)
                     pass
                 if test_date_en is not None:
                     localeEncoding = "en_GB.UTF-8"
                     locale.setlocale(locale.LC_ALL, localeEncoding)

@@ -168,6 +177,13 @@ class EuropressFileParser(FileParser):
             if test_page is not None:
                 hyperdata['page'] = test_page.group(1).encode(codif)
+            # try:
+            #     print('183', hyperdata['publication_date'])
+            # except:
+            #     print('no date yet')
+            #     pass
+            #
             hyperdata['title']    = html_article.xpath(title_xpath).encode(codif)
             hyperdata['abstract'] = html_article.xpath(text_xpath)

@@ -215,7 +231,7 @@ class EuropressFileParser(FileParser):
             hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
             hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
             hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')
-            hyperdata.pop('publication_date')
+            # hyperdata.pop('publication_date')
             if len(hyperdata['abstract']) > 0 and format_europresse == 50:
                 hyperdata['doi'] = str(hyperdata['abstract'][-9])
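Most of the churn in this file replaces bare print(error) handlers with PrintException() from admin.utils, which is imported at the top of the file but not shown in this diff. A minimal sketch of what such a helper could look like, assuming it simply reports the exception currently being handled (the real admin.utils implementation may differ):

    # Hypothetical sketch: admin.utils.PrintException is not part of this diff.
    import sys
    import traceback

    def PrintException():
        # print the active exception together with its traceback,
        # instead of the bare print(error) calls this commit removes
        exc_type, exc_value, exc_tb = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_tb)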
parsing/FileParsers/FileParser.py

 import collections
+import datetime
 import dateutil.parser
 import zipfile
 import chardet
+import re

 from ..Caches import LanguagesCache

+DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)

 class FileParser:
     """Base class for performing files parsing depending on their type.
     """

@@ -29,6 +34,21 @@ class FileParser:
         """
         # First, check the split dates...
+        date_string = hyperdata.get('publication_date_to_parse', None)
+        if date_string is not None:
+            date_string = re.sub(r'\/\/+', '', date_string)
+            date_string = re.sub(r'undefined', '', date_string)
+            try:
+                hyperdata['publication' + "_date"] = dateutil.parser.parse(date_string, default=DEFAULT_DATE).strftime("%Y-%m-%d %H:%M:%S")
+            except:
+                print('Parser Zotero, Date not parsed for:', date_string)
+                hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        elif hyperdata.get('publication_year', None) is not None:
             prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
             for prefix in prefixes:
                 date_string = hyperdata[prefix + "_year"]

@@ -51,11 +71,15 @@ class FileParser:
                     hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
                 except:
                     pass
+        else:
+            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

         # ...then parse all the "date" fields, to parse it into separate elements
         prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
         for prefix in prefixes:
             date = dateutil.parser.parse(hyperdata[prefix + "_date"])
+            #print(date)
             hyperdata[prefix + "_year"]  = date.strftime("%Y")
             hyperdata[prefix + "_month"] = date.strftime("%m")
             hyperdata[prefix + "_day"]   = date.strftime("%d")

@@ -65,6 +89,7 @@ class FileParser:
         # finally, return the transformed result!
         return hyperdata
+        print(hyperdata['publication_date'])

     def format_hyperdata_languages(self, hyperdata):
         """format the languages found in the hyperdata."""
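The new publication_date_to_parse branch relies on dateutil's default= argument to fill in whatever parts of the date are missing from the raw string. A standalone sketch of the same normalization outside the class (the sample input is made up):

    import datetime
    import re
    import dateutil.parser

    DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)

    def normalize_date(date_string):
        # same cleanup as the new branch above: drop '//' runs and literal 'undefined'
        date_string = re.sub(r'\/\/+', '', date_string)
        date_string = re.sub(r'undefined', '', date_string)
        return dateutil.parser.parse(date_string, default=DEFAULT_DATE).strftime("%Y-%m-%d %H:%M:%S")

    print(normalize_date("May 2015"))  # missing day taken from DEFAULT_DATE -> "2015-05-01 00:00:00"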
parsing/FileParsers/RisFileParser.py

@@ -3,6 +3,8 @@ from .FileParser import FileParser
 from ..Caches import LanguagesCache
+from admin.utils import PrintException

 class RisFileParser(FileParser):
     def __init__(self, language_cache=None):

@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
                     print(error)
         # if a hyperdata object is left in memory, yield it as well
         if hyperdata:
+            # try:
+            #     if hyperdata['date_to_parse']:
+            #         print(hyperdata['date_to_parse'])
+            # except:
+            #     pass
+            #
             #print(hyperdata['title'])
             yield hyperdata
parsing/FileParsers/ZoteroFileParser.py (new file, mode 100644)

from .RisFileParser import RisFileParser
from ..Caches import LanguagesCache

class ZoteroFileParser(RisFileParser):
    def __init__(self):
        super(RisFileParser, self).__init__()
        self._begin = 6
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date_to_parse"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
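ZoteroFileParser only remaps RIS tags onto hyperdata keys; the line-by-line reading is inherited from RisFileParser (note that super(RisFileParser, self).__init__() resolves to FileParser.__init__, skipping RisFileParser's own __init__). A hypothetical usage sketch through the registry added in parsing/corpustools.py, inside a configured gargantext environment (the file path is invented):

    from parsing.corpustools import parsers

    # 'Zotero (RIS format)' is the key wired up in parsing/parsers_config.py;
    # the Parsers cache instantiates ZoteroFileParser on first access.
    parser = parsers['Zotero (RIS format)']
    for hyperdata in parser.parse('/tmp/zotero_export.ris'):  # hypothetical RIS export
        print(hyperdata.get('title'), hyperdata.get('publication_date_to_parse'))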
parsing/FileParsers/__init__.py

 from .RisFileParser import RisFileParser
 from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
+from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
 from .EuropressFileParser import EuropressFileParser
 from .ISText import ISText
parsing/corpustools.py (new file, mode 100644)

from collections import defaultdict
from datetime import datetime
from random import random
from hashlib import md5
from time import time
from math import log

from gargantext_web.db import *
from .parsers_config import parsers as _parsers


class DebugTime:
    def __init__(self, prefix):
        self.prefix = prefix
        self.message = None
        self.time = None

    def __del__(self):
        if self.message is not None and self.time is not None:
            print('%s - %s: %.4f' % (self.prefix, self.message, time() - self.time))

    def show(self, message):
        self.__del__()
        self.message = message
        self.time = time()


# keep all the parsers in a cache
class Parsers(defaultdict):
    def __init__(self):
        self._parsers = _parsers

    def __missing__(self, key):
        #print(self._parsers.keys())
        if key not in self._parsers.keys():
            raise NotImplementedError('No such parser: "%s"' % (key))
        parser = self._parsers[key]()
        self[key] = parser
        return parser

parsers = Parsers()


# resources management

def add_resource(corpus, **kwargs):
    # only for tests
    session = Session()
    resource = Resource(guid=str(random()), **kwargs)
    # User
    if 'user_id' not in kwargs:
        resource.user_id = corpus.user_id
    # Compute the digest
    h = md5()
    f = open(str(resource.file), 'rb')
    h.update(f.read())
    f.close()
    resource.digest = h.hexdigest()
    # check if a resource on this node already has this hash
    tmp_resource = (session
        .query(Resource)
        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
        .filter(Resource.digest == resource.digest)
        .filter(Node_Resource.node_id == corpus.id)
    ).first()
    if tmp_resource is not None:
        return tmp_resource
    else:
        session.add(resource)
        session.commit()
    # link with the resource
    node_resource = Node_Resource(
        node_id = corpus.id,
        resource_id = resource.id,
        parsed = False,
    )
    session.add(node_resource)
    session.commit()
    # return result
    return resource


def parse_resources(corpus, user=None, user_id=None):
    dbg = DebugTime('Corpus #%d - parsing' % corpus.id)
    session = Session()
    corpus_id = corpus.id
    type_id = cache.NodeType['Document'].id
    if user_id is None and user is not None:
        user_id = user.id
    else:
        user_id = corpus.user_id
    # find resource of the corpus
    resources_query = (session
        .query(Resource, ResourceType)
        .join(ResourceType, ResourceType.id == Resource.type_id)
        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
        .filter(Node_Resource.node_id == corpus.id)
        .filter(Node_Resource.parsed == False)
    )
    # make a new node for every parsed document of the corpus
    dbg.show('analyze documents')
    nodes = list()
    for resource, resourcetype in resources_query:
        parser = parsers[resourcetype.name]
        for hyperdata_dict in parser.parse(resource.file):
            # retrieve language ID from hyperdata
            if 'language_iso2' in hyperdata_dict:
                try:
                    language_id = cache.Language[hyperdata_dict['language_iso2']].id
                except KeyError:
                    language_id = None
            else:
                language_id = None
            # create new node
            node = Node(
                name = hyperdata_dict.get('title', '')[:200],
                parent_id = corpus_id,
                user_id = user_id,
                type_id = type_id,
                language_id = language_id,
                hyperdata = hyperdata_dict,
                date = datetime.utcnow(),
            )
            nodes.append(node)
            #
            # TODO: mark node-resources associations as parsed
            #
    dbg.show('insert %d documents' % len(nodes))
    session.add_all(nodes)
    session.commit()
    # now, index the hyperdata
    dbg.show('insert hyperdata')
    node_hyperdata_lists = defaultdict(list)
    hyperdata_types = {
        hyperdata.name: hyperdata
        for hyperdata in session.query(Hyperdata)
    }
    for node in nodes:
        node_id = node.id
        for hyperdata_key, hyperdata_value in node.hyperdata.items():
            try:
                hyperdata = hyperdata_types[hyperdata_key]
            except KeyError:
                # Why silent continue here ?
                continue
            if hyperdata.type == 'string':
                hyperdata_value = hyperdata_value[:255]
            node_hyperdata_lists[hyperdata.type].append((
                node_id,
                hyperdata.id,
                hyperdata_value,
            ))
    for key, values in node_hyperdata_lists.items():
        bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_' + key], values)
    # mark the corpus as parsed
    corpus.parsed = True


# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor

class NgramsExtractors(defaultdict):
    def __init__(self):
        # English
        self['en'] = EnglishNgramsExtractor()
        for key in ('eng', 'english'):
            self[key] = self['en']
        # French
        self['fr'] = FrenchNgramsExtractor()
        for key in ('fre', 'french'):
            self[key] = self['fr']
        # default
        self['default'] = NgramsExtractor()

    def __missing__(self, key):
        formatted_key = key.strip().lower()
        if formatted_key in self:
            self[key] = self[formatted_key]
        else:
            self[key] = self['default']
            # raise NotImplementedError
        return self[key]

ngramsextractors = NgramsExtractors()


def extract_ngrams(corpus, keys):
    dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
    default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
    # query the hyperdata associated with the given keys
    columns = [Node.id, Node.language_id] + [Node.hyperdata[key] for key in keys]
    hyperdata_query = (session
        .query(*columns)
        .filter(Node.parent_id == corpus.id)
        .filter(Node.type_id == cache.NodeType['Document'].id)
    )
    # prepare data to be inserted
    dbg.show('find ngrams')
    languages_by_id = {
        language.id: language.iso2
        for language in session.query(Language)
    }
    ngrams_data = set()
    ngrams_language_data = set()
    ngrams_tag_data = set()
    node_ngram_list = defaultdict(lambda: defaultdict(int))
    for nodeinfo in hyperdata_query:
        node_id = nodeinfo[0]
        language_id = nodeinfo[1]
        if language_id is None:
            language_iso2 = default_language_iso2
        else:
            language_iso2 = languages_by_id.get(language_id, None)
        if language_iso2 is None:
            continue
        ngramsextractor = ngramsextractors[language_iso2]
        for text in nodeinfo[2:]:
            if text is not None and len(text):
                ngrams = ngramsextractor.extract_ngrams(text.replace("[", "").replace("]", ""))
                for ngram in ngrams:
                    n = len(ngram)
                    terms = ' '.join([token for token, tag in ngram]).lower()
                    # TODO BUG here
                    if n == 1:
                        #tag_id = cache.Tag[ngram[0][1]].id
                        tag_id = 1
                        #print('tag_id', tag_id)
                    elif n > 1:
                        tag_id = 1
                        #tag_id = cache.Tag[ngram[0][1]].id
                        #tag_id = cache.Tag['NN'].id
                        #tag_id = 14
                        #print('tag_id_2', tag_id)
                    node_ngram_list[node_id][terms] += 1
                    ngrams_data.add((n, terms[:255]))
                    ngrams_language_data.add((terms, language_id))
                    ngrams_tag_data.add((terms, tag_id))
    # insert ngrams to temporary table
    dbg.show('find ids for the %d ngrams' % len(ngrams_data))
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngrams (
            id INT,
            n INT NOT NULL,
            terms VARCHAR(255) NOT NULL
        )
    ''')
    bulk_insert('tmp__ngrams', ['n', 'terms'], ngrams_data, cursor=cursor)
    # retrieve ngram ids from already inserted stuff
    cursor.execute('''
        UPDATE tmp__ngrams
        SET id = ngram.id
        FROM %s AS ngram
        WHERE ngram.terms = tmp__ngrams.terms
    ''' % (Ngram.__table__.name, ))
    # insert, then get the ids back
    cursor.execute('''
        INSERT INTO %s (n, terms)
        SELECT n, terms
        FROM tmp__ngrams
        WHERE id IS NULL
    ''' % (Ngram.__table__.name, ))
    cursor.execute('''
        UPDATE tmp__ngrams
        SET id = ngram.id
        FROM %s AS ngram
        WHERE ngram.terms = tmp__ngrams.terms
        AND tmp__ngrams.id IS NULL
    ''' % (Ngram.__table__.name, ))
    # get all ids
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngrams')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]
    #
    dbg.show('insert associations')
    node_ngram_data = list()
    for node_id, ngrams in node_ngram_list.items():
        for terms, weight in ngrams.items():
            try:
                ngram_id = ngram_ids[terms]
                node_ngram_data.append((node_id, ngram_id, weight, ))
            except Exception as e:
                print("err01:", e)
    bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
    dbg.message = 'insert %d associations' % len(node_ngram_data)
    # commit to database
    db.commit()


# tfidf calculation

def compute_tfidf(corpus):
    dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
    # compute terms frequency sum
    dbg.show('calculate terms frequencies sums')
    db, cursor = get_cursor()
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__st (
            node_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        )
    ''')
    cursor.execute('''
        INSERT INTO tmp__st (node_id, frequency)
        SELECT node_ngram.node_id, SUM(node_ngram.weight) AS frequency
        FROM %s AS node
        INNER JOIN %s AS node_ngram ON node_ngram.node_id = node.id
        WHERE node.parent_id = %d
        GROUP BY node_ngram.node_id
    ''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
    # compute normalized terms frequencies
    dbg.show('normalize terms frequencies')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__tf (
            node_id INT NOT NULL,
            ngram_id INT NOT NULL,
            frequency DOUBLE PRECISION NOT NULL
        )
    ''')
    cursor.execute('''
        INSERT INTO tmp__tf (node_id, ngram_id, frequency)
        SELECT node_ngram.node_id, node_ngram.ngram_id,
               (node_ngram.weight / node.frequency) AS frequency
        FROM %s AS node_ngram
        INNER JOIN tmp__st AS node ON node.node_id = node_ngram.node_id
    ''' % (Node_Ngram.__table__.name, ))
    # show off
    dbg.show('compute idf')
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__idf (
            ngram_id INT NOT NULL,
            idf DOUBLE PRECISION NOT NULL
        )
    ''')
    cursor.execute('''
        INSERT INTO tmp__idf(ngram_id, idf)
        SELECT node_ngram.ngram_id, -ln(COUNT(*))
        FROM %s AS node
        INNER JOIN %s AS node_ngram ON node_ngram.node_id = node.id
        WHERE node.parent_id = %d
        GROUP BY node_ngram.ngram_id
    ''' % (Node.__table__.name, Node_Ngram.__table__.name, corpus.id, ))
    cursor.execute('SELECT COUNT(*) FROM tmp__st')
    D = cursor.fetchone()[0]
    if D > 0:
        lnD = log(D)
        cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
    # show off
    dbg.show('insert tfidf for %d documents' % D)
    cursor.execute('''
        INSERT INTO %s (nodex_id, nodey_id, ngram_id, score)
        SELECT %d AS nodex_id,
               tf.node_id AS nodey_id,
               tf.ngram_id AS ngram_id,
               (tf.frequency * idf.idf) AS score
        FROM tmp__idf AS idf
        INNER JOIN tmp__tf AS tf ON tf.ngram_id = idf.ngram_id
    ''' % (NodeNodeNgram.__table__.name, corpus.id, ))
    # # show off
    # cursor.execute('''
    #     SELECT node.name, ngram.terms, node_node_ngram.score AS tfidf
    #     FROM %s AS node_node_ngram
    #     INNER JOIN %s AS node ON node.id = node_node_ngram.nodey_id
    #     INNER JOIN %s AS ngram ON ngram.id = node_node_ngram.ngram_id
    #     WHERE node_node_ngram.nodex_id = %d
    #     ORDER BY score DESC
    # ''' % (NodeNodeNgram.__table__.name, Node.__table__.name, Ngram.__table__.name, corpus.id, ))
    # for row in cursor.fetchall():
    #     print(row)
    # the end!
    db.commit()
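Taken together, this new module defines the whole ingestion pipeline: register a resource on a corpus, parse it into Document nodes, index the extracted ngrams, then score them. In compute_tfidf, the score inserted per (corpus, document, ngram) is tf * idf, where tf is the ngram's weight divided by the document's total weight (tmp__tf) and idf is ln(D) - ln(document frequency) (tmp__idf). A minimal sketch of how the helpers chain together, assuming corpus is an existing corpus Node, zotero_type_id is the id of the matching ResourceType row, and Resource accepts file and type_id keyword arguments (all placeholders):

    from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf

    add_resource(corpus, file='/tmp/zotero_export.ris', type_id=zotero_type_id)
    parse_resources(corpus)                        # one Document node per parsed record
    extract_ngrams(corpus, ('title', 'abstract'))  # look for ngrams in these hyperdata keys
    compute_tfidf(corpus)                          # fill NodeNodeNgram with tf-idf scores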
parsing/parsers_config.py

@@ -4,12 +4,11 @@ parsers = {
     'Pubmed (xml format)'         : PubmedFileParser,
     'Web of Science (ISI format)' : IsiFileParser,
     'Scopus (RIS format)'         : RisFileParser,
-    'Zotero (RIS format)'         : JstorFileParser,
+    'Zotero (RIS format)'         : ZoteroFileParser,
     'Jstor (RIS format)'          : JstorFileParser,
     #'Europress'                  : EuropressFileParser,
     'Europress (French)'          : EuropressFileParser,
     'Europress (English)'         : EuropressFileParser,
     'CSVParser'                   : CSVParser,
 }