Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
755a8d4d
Commit
755a8d4d
authored
Oct 29, 2014
by
Mathieu Rodic
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEATURE] The parser is working, directly from a Node instance!
parent
c50f2fff
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
169 additions
and
308 deletions
+169
-308
models.py
node/models.py
+35
-10
Caches.py
parsing/Caches.py
+23
-9
FileParser.py
parsing/FileParsers/FileParser.py
+41
-155
PubmedFileParser.py
parsing/FileParsers/PubmedFileParser.py
+26
-37
RisFileParser.py
parsing/FileParsers/RisFileParser.py
+18
-26
__init__.py
parsing/__init__.py
+0
-50
test-parsing_from_node.py
test-parsing_from_node.py
+26
-21
No files found.
node/models.py
View file @
755a8d4d
from
django.db
import
models
from
django.db
import
models
from
django.utils
import
timezone
from
django.utils
import
timezone
from
django.contrib.auth.models
import
User
from
django_hstore
import
hstore
from
django_hstore
import
hstore
from
cte_tree.models
import
CTENode
,
Manager
from
cte_tree.models
import
CTENode
,
Manager
#from cte_tree.fields import DepthField, PathField, OrderingField
#from cte_tree.fields import DepthField, PathField, OrderingField
from
parsing.Caches
import
LanguagesCache
from
parsing.FileParsers
import
*
from
time
import
time
from
time
import
time
from
collections
import
defaultdict
from
django.contrib.auth.models
import
User
from
collections
import
defaultdict
# Some usefull functions
# Some usefull functions
# TODO: start the function name with an underscore (private)
# TODO: start the function name with an underscore (private)
...
@@ -28,7 +31,7 @@ class Language(models.Model):
...
@@ -28,7 +31,7 @@ class Language(models.Model):
def
__str__
(
self
):
def
__str__
(
self
):
return
self
.
fullname
return
self
.
fullname
class
Databas
eType
(
models
.
Model
):
class
Resourc
eType
(
models
.
Model
):
name
=
models
.
CharField
(
max_length
=
255
)
name
=
models
.
CharField
(
max_length
=
255
)
def
__str__
(
self
):
def
__str__
(
self
):
return
self
.
name
return
self
.
name
...
@@ -40,7 +43,7 @@ class Ngram(models.Model):
...
@@ -40,7 +43,7 @@ class Ngram(models.Model):
class
Resource
(
models
.
Model
):
class
Resource
(
models
.
Model
):
guid
=
models
.
CharField
(
max_length
=
255
)
guid
=
models
.
CharField
(
max_length
=
255
)
bdd_type
=
models
.
ForeignKey
(
Databas
eType
,
blank
=
True
,
null
=
True
)
type
=
models
.
ForeignKey
(
Resourc
eType
,
blank
=
True
,
null
=
True
)
file
=
models
.
FileField
(
upload_to
=
upload_to
,
blank
=
True
)
file
=
models
.
FileField
(
upload_to
=
upload_to
,
blank
=
True
)
digest
=
models
.
CharField
(
max_length
=
32
)
# MD5 digest
digest
=
models
.
CharField
(
max_length
=
32
)
# MD5 digest
...
@@ -89,12 +92,33 @@ class Node(CTENode):
...
@@ -89,12 +92,33 @@ class Node(CTENode):
node_resource
.
save
()
node_resource
.
save
()
return
resource
return
resource
def
parse
(
self
):
def
parse
_resources
(
self
):
#
TODO: that's not very pretty...
#
parse all resources into a list of metadata
# can't we make a simple join in Django?
metadata_list
=
[]
for
node_resource
in
self
.
node_resource
.
filter
(
parsed
=
False
):
for
node_resource
in
self
.
node_resource
.
filter
(
parsed
=
False
):
# TODO: call parsers here
resource
=
node_resource
.
resource
print
(
node_resource
.
resource
.
file
)
parser
=
defaultdict
(
lambda
:
FileParser
.
FileParser
,
{
'pubmed'
:
PubmedFileParser
,
'isi'
:
IsiFileParser
,
'ris'
:
RisFileParser
,
'europress'
:
EuropressFileParser
,
})[
resource
.
type
.
name
]()
print
(
parser
)
metadata_list
+=
parser
.
parse
(
str
(
resource
.
file
))
# insert in the database!
type
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
langages_cache
=
LanguagesCache
()
Node
.
objects
.
bulk_create
([
Node
(
user
=
self
.
user
,
type
=
type
,
name
=
metadata
[
'title'
]
if
'title'
in
metadata
else
''
,
parent
=
self
,
language
=
langages_cache
[
metadata
[
'language_iso2'
]]
if
'language_iso2'
in
metadata
else
None
,
metadata
=
metadata
,
)
for
metadata
in
metadata_list
])
def
extract_ngrams
(
self
,
keys
,
cache
):
def
extract_ngrams
(
self
,
keys
,
cache
):
# what do we want from the cache?
# what do we want from the cache?
...
@@ -118,6 +142,7 @@ class Node(CTENode):
...
@@ -118,6 +142,7 @@ class Node(CTENode):
weight
=
weight
weight
=
weight
)
)
class
Node_Resource
(
models
.
Model
):
class
Node_Resource
(
models
.
Model
):
node
=
models
.
ForeignKey
(
Node
,
related_name
=
'node_resource'
)
node
=
models
.
ForeignKey
(
Node
,
related_name
=
'node_resource'
)
resource
=
models
.
ForeignKey
(
Resource
)
resource
=
models
.
ForeignKey
(
Resource
)
...
@@ -126,7 +151,7 @@ class Node_Resource(models.Model):
...
@@ -126,7 +151,7 @@ class Node_Resource(models.Model):
class
Node_Ngram
(
models
.
Model
):
class
Node_Ngram
(
models
.
Model
):
node
=
models
.
ForeignKey
(
Node
)
node
=
models
.
ForeignKey
(
Node
)
ngram
=
models
.
ForeignKey
(
Ngram
)
ngram
=
models
.
ForeignKey
(
Ngram
)
weight
=
models
.
Integer
Field
()
weight
=
models
.
Float
Field
()
class
Project
(
Node
):
class
Project
(
Node
):
class
Meta
:
class
Meta
:
...
...
parsing/Caches.py
View file @
755a8d4d
import
collections
import
node.models
from
node.models
import
Ngram
from
parsing.NgramsExtractors
import
EnglishNgramsExtractor
,
FrenchNgramsExtractor
from
parsing.NgramsExtractors
import
EnglishNgramsExtractor
,
FrenchNgramsExtractor
from
collections
import
defaultdict
class
NgramsCache
(
collections
.
defaultdict
):
class
NgramsCache
(
defaultdict
):
"""This allows the fast retrieval of ngram ids
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.
from a cache instead of calling the database every time.
This class is language-specific."""
This class is language-specific."""
...
@@ -17,14 +17,14 @@ class NgramsCache(collections.defaultdict):
...
@@ -17,14 +17,14 @@ class NgramsCache(collections.defaultdict):
"""If the terms are not yet present in the dictionary,
"""If the terms are not yet present in the dictionary,
retrieve it from the database or insert it."""
retrieve it from the database or insert it."""
try
:
try
:
ngram
=
Ngram
.
get
(
terms
=
terms
,
language
=
self
.
language
)
ngram
=
node
.
models
.
Ngram
.
get
(
terms
=
terms
,
language
=
self
.
language
)
except
:
except
:
ngram
=
Ngram
(
terms
=
terms
,
n
=
len
(
terms
.
split
()),
language
=
self
.
language
)
ngram
=
node
.
models
.
Ngram
(
terms
=
terms
,
n
=
len
(
terms
.
split
()),
language
=
self
.
language
)
ngram
.
save
()
ngram
.
save
()
self
[
terms
]
=
ngram
self
[
terms
]
=
ngram
return
self
[
terms
]
return
self
[
terms
]
class
NgramsCaches
(
collections
.
defaultdict
):
class
NgramsCaches
(
defaultdict
):
"""This allows the fast retrieval of ngram ids
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
from a cache instead of calling the database every time."""
def
__missing__
(
self
,
language
):
def
__missing__
(
self
,
language
):
...
@@ -33,7 +33,7 @@ class NgramsCaches(collections.defaultdict):
...
@@ -33,7 +33,7 @@ class NgramsCaches(collections.defaultdict):
self
[
language
]
=
NgramsCache
(
language
)
self
[
language
]
=
NgramsCache
(
language
)
return
self
[
language
]
return
self
[
language
]
class
NgramsExtractorsCache
(
collections
.
defaultdict
):
class
NgramsExtractorsCache
(
defaultdict
):
"""This allows the fast retrieval of ngram ids
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
from a cache instead of calling the database every time."""
def
__missing__
(
self
,
key
):
def
__missing__
(
self
,
key
):
...
@@ -64,11 +64,25 @@ class NgramsExtractorsCache(collections.defaultdict):
...
@@ -64,11 +64,25 @@ class NgramsExtractorsCache(collections.defaultdict):
# return the proper extractor
# return the proper extractor
return
self
[
key
]
return
self
[
key
]
class
LanguagesCache
(
defaultdict
):
def
__init__
(
self
):
for
language
in
node
.
models
.
Language
.
objects
.
all
():
self
[
language
.
iso2
.
lower
()]
=
language
self
[
language
.
iso3
.
lower
()]
=
language
self
[
language
.
fullname
.
lower
()]
=
language
def
__missing__
(
self
,
key
):
betterKey
=
key
.
strip
()
.
lower
()
self
[
key
]
=
self
[
betterKey
]
if
betterKey
in
self
else
None
return
self
[
betterKey
]
class
Cache
:
class
Cache
s
:
"""This is THE cache of the caches.
"""This is THE cache of the caches.
See NgramsCaches and NgramsExtractorsCache for better understanding."""
See NgramsCaches and NgramsExtractorsCache for better understanding."""
def
__init__
(
self
):
def
__init__
(
self
):
self
.
ngrams
=
NgramsCaches
()
self
.
ngrams
=
NgramsCaches
()
self
.
extractors
=
NgramsExtractorsCache
()
self
.
extractors
=
NgramsExtractorsCache
()
self
.
languages
=
LanguagesCache
()
parsing/FileParsers/FileParser.py
View file @
755a8d4d
from
node.models
import
Node
,
NodeType
,
Language
,
Ngram
,
Node_Ngram
from
parsing.NgramsExtractors
import
*
import
collections
import
collections
import
dateutil.parser
import
dateutil.parser
import
zipfile
import
zipfile
from
parsing.Caches
import
LanguagesCache
class
FileParser
:
class
FileParser
:
"""Base class for performing files parsing depending on their type.
"""Base class for performing files parsing depending on their type.
"""
"""
def
__init__
(
self
,
language_cache
=
None
):
def
__init__
(
self
,
file
=
None
,
filepath
=
""
,
encoding
=
"utf8"
):
self
.
_languages_cache
=
LanguagesCache
()
if
language_cache
is
None
else
language_cache
# ...get the file item...
if
file
is
None
:
self
.
_file
=
open
(
filepath
,
"rb"
)
else
:
self
.
_file
=
file
# cache for ngrams
self
.
_ngramcaches
=
NgramCaches
()
# extractors
self
.
_extractors
=
dict
()
self
.
_document_nodetype
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
languages
=
Language
.
objects
.
all
()
self
.
_languages_fullname
=
{
language
.
fullname
.
lower
():
language
for
language
in
languages
}
self
.
_languages_iso2
=
{
language
.
iso2
.
lower
():
language
for
language
in
languages
}
self
.
_languages_iso3
=
{
language
.
iso3
.
lower
():
language
for
language
in
languages
}
def
extract_ngrams
(
self
,
text
,
language
):
"""Extract the ngrams from a given text.
"""
# Get the appropriate ngrams extractor, if it exists
if
language
not
in
self
.
_extractors
:
extractor
=
None
if
language
.
iso2
==
'en'
:
extractor
=
EnglishNgramsExtractor
()
elif
language
.
iso2
==
'fr'
:
extractor
=
FrenchNgramsExtractor
()
self
.
_extractors
[
language
]
=
extractor
else
:
extractor
=
self
.
_extractors
[
language
]
# Extract the ngrams
if
extractor
:
tokens
=
[]
for
ngram
in
extractor
.
extract_ngrams
(
text
):
ngram_text
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
tokens
.
append
(
ngram_text
)
return
collections
.
Counter
(
tokens
)
else
:
return
dict
()
def
create_document
(
self
,
parentNode
,
title
,
metadata
,
guid
=
None
):
"""Add a document to the database.
"""
metadata
=
self
.
format_metadata
(
metadata
)
# create or retrieve a resource for that document, based on its user id
# if guid is None:
# resource = Resource(guid=guid)
# else:
# try:
# resource = Resource.get(guid=guid)
# except:
# resource = Resource(guid=guid)
# # If the parent node already has a child with this resource, pass
# # (is it a good thing?)
# if parentNode.descendants().filter(resource=resource).exists():
# return None
# create the document itself
try
:
language
=
self
.
_languages_iso3
[
metadata
[
"language_iso3"
]]
except
:
language
=
None
childNode
=
Node
(
user
=
parentNode
.
user
,
type
=
self
.
_document_nodetype
,
name
=
title
,
language
=
language
,
metadata
=
metadata
,
#resource = resource,
parent
=
parentNode
)
childNode
.
save
()
return
childNode
def
detect_encoding
(
self
,
string
):
def
detect_encoding
(
self
,
string
):
"""Useful method to detect the document encoding.
"""Useful method to detect the document encoding.
"""
"""
pass
pass
def
_parse
(
self
,
parentNode
,
file
):
"""This method shall be overriden by inherited classes."""
return
list
()
def
parse
(
self
,
parentNode
,
file
=
None
):
"""Parse the files found in the file.
This method shall be overriden by inherited classes.
"""
if
file
is
None
:
with
transaction
.
atomic
():
self
.
parse
(
parentNode
,
self
.
_file
)
if
zipfile
.
is_zipfile
(
file
):
with
zipfile
.
ZipFile
(
file
)
as
zipArchive
:
for
filename
in
zipArchive
.
namelist
():
self
.
parse
(
parentNode
,
zipArchive
.
open
(
filename
,
"r"
))
else
:
self
.
_parse
(
parentNode
,
file
)
def
extract
(
self
,
parentNode
,
keys
):
"""Extract ngrams from the child nodes, given a list of field names."""
# get all the descendants of type "document"
childNodes
=
parentNode
.
descendants
()
.
filter
(
type
=
self
.
_document_nodetype
)
with
transaction
.
atomic
():
for
childNode
in
childNodes
:
# most importantly...
metadata
=
childNode
.
metadata
# which extractor shall we use?
if
language
not
in
self
.
_extractors
:
extractor
=
None
if
language
.
iso2
==
'en'
:
# use English
extractor
=
EnglishNgramsExtractor
()
elif
language
.
iso2
==
'fr'
:
# use French
extractor
=
FrenchNgramsExtractor
()
else
:
# no recognized language has been specified...
continue
self
.
_extractors
[
language
]
=
extractor
# extract ngrams from every field, find the id, count them
ngrams
=
collections
.
defaultdict
(
int
)
ngramscache
=
self
.
_ngramcaches
[
language
]
for
key
in
keys
:
for
ngram
in
extractor
.
extract_ngrams
(
text
):
ngram_text
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
ngram_id
=
ngramscache
[
ngramtext
]
.
id
ngrams
[
ngram_id
]
+=
1
# insert node/ngram associations in the database
for
ngram_id
,
occurences
in
ngrams
.
items
():
Node_Ngram
(
node_id
=
childNode
.
id
,
ngram_id
=
ngram_id
,
occurences
=
occurences
)
.
save
()
def
format_metadata_dates
(
self
,
metadata
):
def
format_metadata_dates
(
self
,
metadata
):
"""Format the dates found in the metadata.
"""Format the dates found in the metadata.
Examples:
Examples:
{"publication_date": "2014-10-23 09:57:42"}
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
"""
# First, check the split dates...
# First, check the split dates...
...
@@ -189,31 +62,24 @@ class FileParser:
...
@@ -189,31 +62,24 @@ class FileParser:
metadata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
metadata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
metadata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
metadata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
# finally, return the result!
# finally, return the
transformed
result!
return
metadata
return
metadata
def
format_metadata_languages
(
self
,
metadata
):
def
format_metadata_languages
(
self
,
metadata
):
"""format the languages found in the metadata."""
"""format the languages found in the metadata."""
try
:
language
=
None
if
"language_fullname"
in
metadata
:
for
key
in
[
"fullname"
,
"iso3"
,
"iso2"
]:
language
=
self
.
_languages_fullname
[
metadata
[
"language_fullname"
]
.
lower
()]
if
key
in
metadata
:
elif
"language_iso3"
in
metadata
:
language_symbol
=
metadata
[
"language_"
+
key
]
language
=
self
.
_languages_iso3
[
metadata
[
"language_iso3"
]
.
lower
()]
language
=
self
.
_languages_cache
[
language_symbol
]
elif
"language_iso2"
in
metadata
:
if
language
:
language
=
self
.
_languages_iso2
[
metadata
[
"language_iso2"
]
.
lower
()]
break
else
:
if
language
:
return
metadata
except
KeyError
:
# the language has not been found
for
key
in
[
"language_fullname"
,
"language_iso3"
,
"language_iso2"
]:
try
:
metadata
.
pop
(
key
)
except
:
continue
return
metadata
metadata
[
"language_iso2"
]
=
language
.
iso2
metadata
[
"language_iso2"
]
=
language
.
iso2
metadata
[
"language_iso3"
]
=
language
.
iso3
metadata
[
"language_iso3"
]
=
language
.
iso3
metadata
[
"language_fullname"
]
=
language
.
fullname
metadata
[
"language_fullname"
]
=
language
.
fullname
return
metadata
return
metadata
def
format_metadata
(
self
,
metadata
):
def
format_metadata
(
self
,
metadata
):
"""Format the metadata."""
"""Format the metadata."""
metadata
=
self
.
format_metadata_dates
(
metadata
)
metadata
=
self
.
format_metadata_dates
(
metadata
)
...
@@ -221,3 +87,23 @@ class FileParser:
...
@@ -221,3 +87,23 @@ class FileParser:
return
metadata
return
metadata
def
_parse
(
self
,
file
):
"""This method shall be overriden by inherited classes."""
return
list
()
def
parse
(
self
,
file
):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of metadata
metadata_list
=
[]
# is the file is a ZIP archive, recurse on each of its files...
if
zipfile
.
is_zipfile
(
file
):
zipArchive
=
zipfile
.
ZipFile
(
file
)
for
filename
in
zipArchive
.
namelist
():
metadata_list
+=
self
.
parse
(
zipArchive
.
open
(
filename
,
"r"
))
# ...otherwise, let's parse it directly!
else
:
metadata_list
+=
self
.
_parse
(
file
)
# return the list of formatted metadata
return
map
(
self
.
format_metadata
,
metadata_list
)
parsing/FileParsers/PubmedFileParser.py
View file @
755a8d4d
...
@@ -5,16 +5,14 @@ from parsing.NgramsExtractors import *
...
@@ -5,16 +5,14 @@ from parsing.NgramsExtractors import *
class
PubmedFileParser
(
FileParser
):
class
PubmedFileParser
(
FileParser
):
def
_parse
(
self
,
parentNode
,
file
):
def
_parse
(
self
,
file
):
# open the file as XML
# open the file as XML
xml_parser
=
etree
.
XMLParser
(
resolve_entities
=
False
,
recover
=
True
)
xml_parser
=
etree
.
XMLParser
(
resolve_entities
=
False
,
recover
=
True
)
xml
=
etree
.
parse
(
file
,
parser
=
xml_parser
)
xml
=
etree
.
parse
(
file
,
parser
=
xml_parser
)
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
xml_articles
=
xml
.
findall
(
'PubmedArticle'
)
with
transaction
.
atomic
():
# initialize the list of metadata
# initialize the list of documents
metadata_list
=
[]
documents
=
[]
# parse all the articles, one by one
# parse all the articles, one by one
# all database operations should be performed within one transaction
for
xml_article
in
xml_articles
:
for
xml_article
in
xml_articles
:
# extract data from the document
# extract data from the document
metadata
=
{}
metadata
=
{}
...
@@ -34,15 +32,6 @@ class PubmedFileParser(FileParser):
...
@@ -34,15 +32,6 @@ class PubmedFileParser(FileParser):
metadata
[
key
]
=
node
.
text
metadata
[
key
]
=
node
.
text
except
:
except
:
metadata
[
key
]
=
""
metadata
[
key
]
=
""
contents
=
metadata
[
"abstract"
]
metadata_list
.
append
(
metadata
)
# create the document in the database
# return the list of metadata
document
=
self
.
create_document
(
return
metadata_list
parentNode
=
parentNode
,
title
=
metadata
[
"title"
],
metadata
=
metadata
,
#guid = metadata["doi"],
)
if
document
:
documents
.
append
(
document
)
# return the list of documents
return
documents
parsing/FileParsers/RisFileParser.py
View file @
755a8d4d
...
@@ -7,11 +7,11 @@ class RisFileParser(FileParser):
...
@@ -7,11 +7,11 @@ class RisFileParser(FileParser):
_parameters
=
{
_parameters
=
{
}
}
def
_parse
(
self
,
parentNode
,
file
):
def
_parse
(
self
,
file
):
metadata_list
=
[]
metadata
=
{}
metadata
=
{}
last_key
=
None
last_key
=
None
last_values
=
[]
last_values
=
[]
with
transaction
.
atomic
():
for
line
in
self
.
_file
:
for
line
in
self
.
_file
:
if
len
(
line
)
>
2
:
if
len
(
line
)
>
2
:
parameter_key
=
line
[:
2
]
parameter_key
=
line
[:
2
]
...
@@ -23,17 +23,9 @@ class RisFileParser(FileParser):
...
@@ -23,17 +23,9 @@ class RisFileParser(FileParser):
metadata
[
parameter
[
"key"
]]
=
separator
.
join
(
last_values
)
metadata
[
parameter
[
"key"
]]
=
separator
.
join
(
last_values
)
elif
parameter
[
"type"
]
==
"delimiter"
:
elif
parameter
[
"type"
]
==
"delimiter"
:
language
=
self
.
_languages_fullname
[
metadata
[
"language"
]
.
lower
()]
language
=
self
.
_languages_fullname
[
metadata
[
"language"
]
.
lower
()]
# self.create_document(
metadata_list
.
append
(
metadata
)
# parentNode = parentNode,
# title = metadata["title"],
# metadata = metadata,
# guid = metadata["doi"]
# )
print
(
self
.
format_metadata
(
metadata
))
print
()
metadata
=
{}
last_key
=
parameter_key
last_key
=
parameter_key
last_values
=
[]
last_values
=
[]
last_values
.
append
(
line
[
3
:
-
1
]
.
decode
())
last_values
.
append
(
line
[
3
:
-
1
]
.
decode
())
self
.
_file
.
close
()
return
metadata_list
parsing/__init__.py
deleted
100644 → 0
View file @
c50f2fff
#from .Taggers import *
#from .NgramsExtractors import *
from
.FileParsers
import
*
from
node.models
import
Node
,
NodeType
import
zipfile
import
collections
# import chardet
class
Parser
:
def
__init__
(
self
):
pass
def
parse_file
(
self
,
file
):
# CHECKER GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!!
pass
def
parse_node_fichier
(
self
,
node
):
if
node
.
fichier
and
zipfile
.
is_zipfile
(
node
.
fichier
):
with
zipfile
.
ZipFile
(
node
.
fichier
,
"r"
)
as
zipFile
:
node_type
=
NodeType
.
objects
.
get
(
name
=
"Document"
)
for
filename
in
zipFile
.
namelist
():
file
=
zipFile
.
open
(
filename
,
"r"
)
node
.
objects
.
create
(
parent
=
node
,
type
=
node_type
,
user
=
node
.
user
,
)
def
parse_node
(
self
,
node
):
for
resource
in
node
.
resources
:
if
node
.
resources
.
file
and
zipfile
.
is_zipfile
(
node
.
resources
.
file
):
with
zipfile
.
ZipFile
(
node
.
resources
.
file
,
"r"
)
as
zipFile
:
for
filename
in
zipFile
.
namelist
():
file
=
zipFile
.
open
(
filename
,
"r"
)
Node
.
objects
.
create
(
parent
=
node
,
type
=
NodeType
.
get
(
name
=
"Document"
),
user
=
node
.
user
,
)
def
parse_node_recursively
(
self
,
node
):
self
.
parse_node
(
node
)
for
descendant
in
node
.
get_descendants
():
self
.
parse_node
(
descendant
)
test-parsing_from_node.py
View file @
755a8d4d
from
node.models
import
Node
,
NodeType
,
User
,
Language
from
node.models
import
Node
,
NodeType
,
User
,
Language
,
ResourceType
from
parsing.Caches
import
Cache
from
parsing.Caches
import
Cache
s
try
:
try
:
me
=
User
.
objects
.
get
(
username
=
'Mat'
)
me
=
User
.
objects
.
get
(
username
=
'Mat'
)
...
@@ -7,6 +7,12 @@ except:
...
@@ -7,6 +7,12 @@ except:
me
=
User
(
username
=
'Mat'
)
me
=
User
(
username
=
'Mat'
)
me
.
save
()
me
.
save
()
try
:
typePubmed
=
ResourceType
.
get
(
name
=
'pubmed'
)
except
:
typePubmed
=
ResourceType
(
name
=
'pubmed'
)
typePubmed
.
save
()
try
:
try
:
typeCorpus
=
NodeType
.
get
(
name
=
'corpus'
)
typeCorpus
=
NodeType
.
get
(
name
=
'corpus'
)
typeDoc
=
NodeType
.
get
(
name
=
'document'
)
typeDoc
=
NodeType
.
get
(
name
=
'document'
)
...
@@ -25,25 +31,24 @@ try:
...
@@ -25,25 +31,24 @@ try:
except
:
except
:
corpus
=
Node
(
name
=
'My first corpus'
,
type
=
typeCorpus
,
user
=
me
)
corpus
=
Node
(
name
=
'My first corpus'
,
type
=
typeCorpus
,
user
=
me
)
corpus
.
save
()
corpus
.
save
()
for
i
in
range
(
64
):
# for i in range(64):
title
=
'Document #
%
d'
%
i
# title = 'Document #%d' % i
Node
(
# Node(
user
=
me
,
# user = me,
# type = self._document_nodetype,
# # type = self._document_nodetype,
name
=
title
,
# name = title,
language
=
english
,
# language = english,
metadata
=
{
'title'
:
title
},
# metadata = {'title':title},
#resource = resource,
# #resource = resource,
type
=
typeDoc
,
# type = typeDoc,
parent
=
corpus
# parent = corpus
)
.
save
()
# ).save()
corpus
.
add_resource
(
file
=
'/path/to/file'
)
corpus
.
children
.
all
()
.
delete
()
corpus
.
parse
()
corpus
.
add_resource
(
file
=
'./data_samples/pubmed.zip'
,
type
=
typePubmed
)
corpus
.
parse_resources
()
exit
()
cache
=
Caches
()
cache
=
Cache
()
for
child
in
corpus
.
children
.
all
():
for
child
in
corpus
.
children
.
all
():
print
(
child
.
id
)
print
(
'#
%
d
\t
%
s
\n
%
s
\n\n
'
%
(
child
.
id
,
child
.
name
,
child
.
metadata
[
'abstract'
]))
child
.
extract_ngrams
([
'title'
],
cache
)
# child.extract_ngrams(['title'], cache)
\ No newline at end of file
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment