Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
2fbda243
Commit
2fbda243
authored
Mar 24, 2015
by
Administrator
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Ngrams with language and tags manyToMany Fields.
parent
1283a347
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
95 additions
and
8 deletions
+95
-8
part_of_speech_labels.py
init/part_of_speech_labels.py
+14
-0
part_of_speech_labels.txt
init/part_of_speech_labels.txt
+37
-0
models.py
node/models.py
+18
-4
corpustools.py
parsing/corpustools.py
+26
-4
No files found.
init/part_of_speech_labels.py
0 → 100644
View file @
2fbda243
from
gargantext_web.db
import
*
# Populate the Tag table from the part-of-speech labels file.
# Each non-empty line of the file is expected to hold "<name>\t<description>".
with open("/srv/gargantext/init/part_of_speech_labels.txt", 'r') as f:
    # The context manager closes the file even if a line fails to parse
    # or the session raises.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpacking below.
            continue
        # Split on the first tab only, so a tab inside the description
        # cannot break the unpacking.
        name, description = line.split('\t', 1)
        _tag = Tag(name=name, description=description)
        session.add(_tag)

# Persist all the tags added above.
session.commit()
init/part_of_speech_labels.txt
0 → 100644
View file @
2fbda243
CC Coordinating conjunction
CD Cardinal number
DT Determiner
EX Existential there
FW Foreign word
IN Preposition or subordinating conjunction
JJ Adjective
JJR Adjective, comparative
JJS Adjective, superlative
LS List item marker
MD Modal
NN Noun, singular or mass
NNS Noun, plural
NNP Proper noun, singular
NNPS Proper noun, plural
PDT Predeterminer
POS Possessive ending
PRP Personal pronoun
PRP$ Possessive pronoun
RB Adverb
RBR Adverb, comparative
RBS Adverb, superlative
RP Particle
SYM Symbol
TO to
UH Interjection
VB Verb, base form
VBD Verb, past tense
VBG Verb, gerund or present participle
VBN Verb, past participle
VBP Verb, non-3rd person singular present
VBZ Verb, 3rd person singular present
WDT Wh-determiner
WP Wh-pronoun
WP$ Possessive wh-pronoun
WRB Wh-adverb
NGRA Ngram
node/models.py
View file @
2fbda243
...
...
@@ -52,20 +52,34 @@ class ResourceType(models.Model):
def __str__(self):
    """Return the ``name`` attribute as the string form of this object."""
    return self.name
class
Ngram
Tag
(
models
.
Model
):
tag
=
models
.
CharField
(
max_length
=
4
,
unique
=
True
)
class Tag(models.Model):
    """A named label with a human-readable description.

    Rows of this model are the targets of the ``NgramTag`` association
    declared in this file.
    """
    # Short code of the tag: at most 4 characters, unique across the table.
    name = models.CharField(max_length=4, unique=True)
    # Longer description of the tag: at most 255 characters, also unique.
    description = models.CharField(max_length=255, unique=True)

    def __str__(self):
        """Return the tag's short ``name``, like the sibling models do."""
        return self.name
class Ngram(models.Model):
    """A unique sequence of terms, linked to languages, nodes and tags.

    Languages and tags are many-to-many relations that go through the
    explicit ``NgramLanguage`` and ``NgramTag`` association models; nodes go
    through ``Node_Ngram``.
    """
    # Languages of this ngram. ``null`` is deliberately not passed: it has no
    # effect on a ManyToManyField, ``blank=True`` is what makes it optional.
    language = models.ManyToManyField(blank=True, through='NgramLanguage', to='Language')
    # Integer size of the ngram.
    # NOTE(review): presumably the number of tokens in ``terms`` -- confirm
    # against the code that inserts ngrams.
    n = models.IntegerField()
    # Text of the ngram: at most 255 characters, unique across the table.
    terms = models.CharField(max_length=255, unique=True)
    # Nodes in which this ngram occurs.
    nodes = models.ManyToManyField(through='Node_Ngram', to='Node')
    # Tags attached to this ngram (see the note on ``null`` above).
    tag = models.ManyToManyField(blank=True, through='NgramTag', to='Tag')

    def __str__(self):
        """Return the ngram's ``terms`` as its string form."""
        return self.terms
class NgramTag(models.Model):
    """Association row linking one ``Ngram`` to one ``Tag``."""
    # Deleting the ngram cascades to its tag associations.
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    tag = models.ForeignKey(Tag)

    def __str__(self):
        """Render the association as ``"<ngram terms>: <tag name>"``."""
        label_parts = (self.ngram.terms, self.tag.name)
        return "%s: %s" % label_parts
class NgramLanguage(models.Model):
    """Association row linking one ``Ngram`` to one ``Language``."""
    # Deleting the ngram cascades to its language associations.
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    language = models.ForeignKey(Language)

    def __str__(self):
        """Render the association as ``"<ngram terms>: <language fullname>"``."""
        label_parts = (self.ngram.terms, self.language.fullname)
        return "%s: %s" % label_parts
class
Resource
(
models
.
Model
):
user
=
models
.
ForeignKey
(
User
)
...
...
parsing/corpustools.py
View file @
2fbda243
...
...
@@ -211,7 +211,11 @@ def extract_ngrams(corpus, keys):
language
.
id
:
language
.
iso2
for
language
in
session
.
query
(
Language
)
}
ngrams_data
=
set
()
ngrams_language_data
=
set
()
ngrams_tag_data
=
set
()
node_ngram_list
=
defaultdict
(
lambda
:
defaultdict
(
int
))
for
nodeinfo
in
metadata_query
:
node_id
=
nodeinfo
[
0
]
...
...
@@ -227,12 +231,25 @@ def extract_ngrams(corpus, keys):
if
text
is
not
None
and
len
(
text
):
ngrams
=
ngramsextractor
.
extract_ngrams
(
text
.
replace
(
"["
,
""
)
.
replace
(
"]"
,
""
))
for
ngram
in
ngrams
:
terms
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
.
lower
()
n
=
len
(
ngram
)
terms
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
.
lower
()
# TODO BUG here
if
n
==
1
:
tag_id
=
cache
.
Tag
[
ngram
[
0
][
1
]]
.
id
#tag_id = 1
#print('tag_id', tag_id)
elif
n
>
1
:
tag_id
=
cache
.
Tag
[
'NN'
]
.
id
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list
[
node_id
][
terms
]
+=
1
ngrams_data
.
add
(
(
n
,
terms
)
)
ngrams_data
.
add
((
n
,
terms
))
ngrams_language_data
.
add
((
terms
,
language_id
))
ngrams_tag_data
.
add
((
terms
,
tag_id
))
# insert ngrams to temporary table
dbg
.
show
(
'find ids for the
%
d ngrams'
%
len
(
ngrams_data
))
db
,
cursor
=
get_cursor
()
...
...
@@ -256,6 +273,7 @@ def extract_ngrams(corpus, keys):
ngram.terms = tmp__ngrams.terms
'''
%
(
Ngram
.
__table__
.
name
,
))
# insert, then get the ids back
cursor
.
execute
(
'''
INSERT INTO
%
s (n, terms)
...
...
@@ -266,6 +284,8 @@ def extract_ngrams(corpus, keys):
WHERE
id IS NULL
'''
%
(
Ngram
.
__table__
.
name
,
))
cursor
.
execute
(
'''
UPDATE
tmp__ngrams
...
...
@@ -278,11 +298,13 @@ def extract_ngrams(corpus, keys):
AND
tmp__ngrams.id IS NULL
'''
%
(
Ngram
.
__table__
.
name
,
))
# get all ids
ngram_ids
=
dict
()
cursor
.
execute
(
'SELECT id, terms FROM tmp__ngrams'
)
for
row
in
cursor
.
fetchall
():
ngram_ids
[
row
[
1
]]
=
row
[
0
]
#
dbg
.
show
(
'insert associations'
)
node_ngram_data
=
list
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment