Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
e6f6eca9
Commit
e6f6eca9
authored
Oct 19, 2014
by
Mathieu Rodic
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
More corrections
parent
07a6f374
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
56 additions
and
6 deletions
+56
-6
EuropressFileParser.py
mat-parsing/FileParsers/EuropressFileParser.py
+5
-3
FileParser.py
mat-parsing/FileParsers/FileParser.py
+2
-1
IsiFileParser.py
mat-parsing/FileParsers/IsiFileParser.py
+43
-0
PubmedFileParser.py
mat-parsing/FileParsers/PubmedFileParser.py
+6
-2
No files found.
mat-parsing/FileParsers/EuropressFileParser.py
View file @
e6f6eca9
from
django.db
import
transaction
from
FileParser
import
FileParser
class
EuropressFileParser
(
FileParser
,
contents
):
class
EuropressFileParser
(
FileParser
):
def
parse
:
def
parse
(
self
,
parentNode
)
:
pass
pass
\ No newline at end of file
mat-parsing/FileParsers/FileParser.py
View file @
e6f6eca9
...
@@ -84,7 +84,8 @@ class FileParser:
...
@@ -84,7 +84,8 @@ class FileParser:
resource
=
Resource
(
guid
=
guid
)
resource
=
Resource
(
guid
=
guid
)
# If the parent node already has a child with this resource, pass
# If the parent node already has a child with this resource, pass
# (is it a good thing?)
# (is it a good thing?)
if
parentNode
.
get_descendants
()
.
if
parentNode
.
get_descendants
()
.
filter
(
resource
=
resource
)
.
exists
():
return
None
# create the document itself
# create the document itself
childNode
=
Node
(
childNode
=
Node
(
user
=
parentNode
.
pk
,
user
=
parentNode
.
pk
,
...
...
mat-parsing/FileParsers/IsiFileParser.py
0 → 100644
View file @
e6f6eca9
from
django.db
import
transaction
from
FileParser
import
FileParser
class
IsiFileParser
(
FileParser
):
def
parse
(
self
,
parentNode
):
# read the file, line by line
for
line
in
self
.
__file
:
# open the file as XML
xml_parser
=
etree
.
XMLParser
(
resolve_entities
=
False
,
recover
=
True
)
xml
=
etree
.
parse
(
self
.
_file
,
parser
=
xml_parser
)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
with
transaction
.
atomic
():
for
xml_article
in
xml_articles
:
# extract data from the document
date_year
=
int
(
xml_article
.
find
(
'MedlineCitation/DateCreated/Year'
)
.
text
)
date_month
=
int
(
xml_article
.
find
(
'MedlineCitation/DateCreated/Month'
)
.
text
)
date_day
=
int
(
xml_article
.
find
(
'MedlineCitation/DateCreated/Day'
)
.
text
)
metadata
=
{
# other metadata should also be included:
# authors, submission date, etc.
"date_pub"
:
datetime
.
date
(
year
,
month
,
day
),
"journal"
:
xml_article
.
find
(
'MedlineCitation/Article/Journal/Title'
)
.
text
"title"
:
xml_article
.
find
(
'MedlineCitation/Article/ArticleTitle'
)
.
text
"language_iso3"
:
xml_article
.
find
(
'MedlineCitation/Article/Language'
)
.
text
"doi"
:
xml_article
.
find
(
'PubmedData/ArticleIdList/ArticleId[type=doi]'
)
.
text
}
contents
=
xml_article
.
find
(
'MedlineCitation/Article/Abstract/AbstractText'
)
.
text
# create the document in the database
yield
self
.
create_document
(
parentNode
=
parentNode
title
=
metadata
[
"title"
],
contents
=
contents
,
language
=
self
.
_languages_iso3
[
metadata
[
"language"
]
.
lower
()]
metadata
=
metadata
,
guid
=
metadata
[
"doi"
],
)
mat-parsing/FileParsers/PubmedFileParser.py
View file @
e6f6eca9
...
@@ -6,11 +6,12 @@ class PubmedFileParser(FileParser):
...
@@ -6,11 +6,12 @@ class PubmedFileParser(FileParser):
def
parse
(
self
,
parentNode
):
def
parse
(
self
,
parentNode
):
# open the file as XML
# open the file as XML
xml_parser
=
etree
.
XMLParser
(
resolve_entities
=
False
,
recover
=
True
)
xml_parser
=
etree
.
XMLParser
(
resolve_entities
=
False
,
recover
=
True
)
xml
=
etree
.
parse
(
self
.
_file
,
parser
=
xml_parser
)
xml
=
etree
.
parse
(
self
.
_file
,
parser
=
xml_parser
)
# parse all the articles, one by one
# parse all the articles, one by one
# all database operations should be performed within one transaction
# all database operations should be performed within one transaction
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
documents
=
[]
with
transaction
.
atomic
():
with
transaction
.
atomic
():
for
xml_article
in
xml_articles
:
for
xml_article
in
xml_articles
:
# extract data from the document
# extract data from the document
...
@@ -28,7 +29,7 @@ class PubmedFileParser(FileParser):
...
@@ -28,7 +29,7 @@ class PubmedFileParser(FileParser):
}
}
contents
=
xml_article
.
find
(
'MedlineCitation/Article/Abstract/AbstractText'
)
.
text
contents
=
xml_article
.
find
(
'MedlineCitation/Article/Abstract/AbstractText'
)
.
text
# create the document in the database
# create the document in the database
yield
self
.
create_document
(
document
=
self
.
create_document
(
parentNode
=
parentNode
parentNode
=
parentNode
title
=
metadata
[
"title"
],
title
=
metadata
[
"title"
],
contents
=
contents
,
contents
=
contents
,
...
@@ -36,3 +37,6 @@ class PubmedFileParser(FileParser):
...
@@ -36,3 +37,6 @@ class PubmedFileParser(FileParser):
metadata
=
metadata
,
metadata
=
metadata
,
guid
=
metadata
[
"doi"
],
guid
=
metadata
[
"doi"
],
)
)
if
document
:
documents
.
append
(
document
)
return
documents
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment