Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
GarganTexternal tools
Commits
5635c954
Commit
5635c954
authored
1 year ago
by
Anne-Laure Thomas Derepas
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into 'master'
Dev See merge request
athomas/gargantexternal-tools!13
parents
41e00418
22967fb8
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
494 additions
and
116 deletions
+494
-116
Istex2ggtx.py
Conversion/ToTSV/IsTexToGarganText/Istex2ggtx.py
+88
-0
README.md
Conversion/ToTSV/IsTexToGarganText/README.md
+15
-0
istex-subset-2023-07-17.zip
...oTSV/IsTexToGarganText/sample/istex-subset-2023-07-17.zip
+0
-0
istex-subset-2023-07-19.zip
...oTSV/IsTexToGarganText/sample/istex-subset-2023-07-19.zip
+0
-0
README.md
Conversion/ToTSV/ZoteroToGarganText/README.md
+11
-0
ZoteroToGarganText.py
Conversion/ToTSV/ZoteroToGarganText/ZoteroToGarganText.py
+95
-0
IsidoreAPIToGarganText.py
Conversion/ToTSV/isidoreToTSV/IsidoreAPIToGarganText.py
+102
-0
README.md
Conversion/ToTSV/isidoreToTSV/README.md
+38
-0
pages.toml
Streamlit/.streamlit/pages.toml
+4
-0
text_GEXFToTermOcc.csv
Streamlit/lang/text_GEXFToTermOcc.csv
+15
-0
text_IsidoreToGarganText.csv
Streamlit/lang/text_IsidoreToGarganText.csv
+5
-5
text_IstexToGarganText.csv
Streamlit/lang/text_IstexToGarganText.csv
+1
-1
text_PubMedToGarganText.csv
Streamlit/lang/text_PubMedToGarganText.csv
+2
-2
text_YTBtoTSV.csv
Streamlit/lang/text_YTBtoTSV.csv
+1
-1
text_ZoteroToGarganText.csv
Streamlit/lang/text_ZoteroToGarganText.csv
+2
-2
Clean_CSV_to_TSV.py
Streamlit/pages/Clean_CSV_to_TSV.py
+57
-92
GEXF_To_TermOcc.py
Streamlit/pages/GEXF_To_TermOcc.py
+43
-0
HAL_To_GarganText.py
Streamlit/pages/HAL_To_GarganText.py
+2
-2
Isidore_To_GarganText.py
Streamlit/pages/Isidore_To_GarganText.py
+8
-6
Istex_To_GarganText.py
Streamlit/pages/Istex_To_GarganText.py
+2
-4
PDF_to_TSV.py
Streamlit/pages/PDF_to_TSV.py
+3
-1
No files found.
Conversion/ToTSV/IsTexToGarganText/Istex2ggtx.py
0 → 100644
View file @
5635c954
#!/usr/bin/env python
# coding: utf-8
"""Convert a zip archive of Istex JSON records into a GarganText TSV corpus.

Usage: python3 Istex2ggtx.py input.zip output.tsv
"""

import datetime
import json
import sys
import zipfile

import pandas as pd


def _clean(text):
    """Drop non-UTF-8 bytes and replace tabs (the TSV separator) by spaces."""
    return (text.encode(encoding='UTF-8', errors='ignore')
                .decode('utf-8')
                .replace('\t', ' '))


input_file = sys.argv[1]
output_file = sys.argv[2]

output = []
with zipfile.ZipFile(input_file, 'r') as zip_ref:
    for file in zip_ref.namelist():
        # Keep only JSON records; skip the archive's manifest file.
        # (The original indexed split('.')[1], which crashed on any
        # entry name that contains no dot.)
        if not file.endswith('.json') or file.split('.')[0] == 'manifest':
            continue
        try:
            article = json.load(zip_ref.open(file))
            temp = {}
            temp["title"] = _clean(article.get("title", ""))
            temp["abstract"] = _clean(article.get("abstract", ""))
            authors = ", ".join(author["name"]
                                for author in article.get("author", []))
            # 'code' (the Istex id) is only used to report duplicates
            # below; it is dropped before the TSV is written.
            temp["code"] = article.get("_id")
            # ',' -> ';' because GarganText separates authors with ';'.
            temp["authors"] = _clean(authors).replace(",", ";")
            temp["source"] = _clean(article["host"]["title"])
            temp["publication_year"] = article.get("publicationDate",
                                                   datetime.date.today().year)
            temp["publication_month"] = 1
            temp["publication_day"] = 1
            output.append(temp)
        except Exception as e:
            # Best-effort: report the bad record and keep converting.
            print(file, e)

output = pd.DataFrame(output)

# Flag records whose title (lower-cased, commas removed) already appeared.
duplicated = output['title'].str.lower().replace(",", "", regex=True).duplicated()
if duplicated.any():
    print("\nQuelques fichiers n'ont pas été introduits dans le TSV car ils pourraient apparaitre plusieurs fois:")
    # BUG FIX: the original looped up to size - 1 and never reported the
    # last record.
    for i in range(output["title"].size):
        if duplicated[i]:
            print("\t" + str(output["code"][i]) + " " + output["title"][i])

# BUG FIX: DataFrame.drop returns a new frame; the original discarded the
# result, so the internal 'code' column leaked into the exported TSV.
output = output.drop(['code'], axis=1)
output = output[~duplicated]

output.to_csv(output_file, sep='\t', index=False)
print("")
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/IsTexToGarganText/README.md
0 → 100644
View file @
5635c954
# IstexToGargantext
## About The project
IstexToGargantext converts a zip file from Istex into a TSV file for GarganText.
## Usage
```
shell
python3 Istex2ggtx.py file.zip output.tsv
```
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/IsTexToGarganText/sample/istex-subset-2023-07-17.zip
0 → 100644
View file @
5635c954
File added
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/IsTexToGarganText/sample/istex-subset-2023-07-19.zip
0 → 100644
View file @
5635c954
File added
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/ZoteroToGarganText/README.md
0 → 100644
View file @
5635c954
# ZoteroToGargantext
## About The project
ZoteroToGarganText isn't usable right now; it needs modifications to transform txt and pdf files from Zotero into a TSV for GarganText.
## Usage
```
shell
python3 ZoteroToGarganText.py
```
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/ZoteroToGarganText/ZoteroToGarganText.py
0 → 100644
View file @
5635c954
from
pyzotero
import
zotero
from
datetime
import
date
def getDataFromWebPage(item):
    """Build one GarganText TSV row from a Zotero item.

    Parameters
    ----------
    item : dict
        A Zotero API item; only ``item['data']`` is read (``title``,
        ``url``, and optionally ``creators``, ``abstractNote``, ``date``).

    Returns
    -------
    str
        One tab-separated line ending with a newline:
        title, source (url), year, month, day, abstract, authors, weight.
    """
    data = item['data']
    title = data['title']
    source = data['url']
    abstract = data.get('abstractNote', '')

    # Authors: creators' last names joined with ';'
    # (';' is GarganText's author separator).
    if 'creators' in data:
        authors = ';'.join(author['lastName'] for author in data['creators'])
    else:
        authors = ''

    # Publication date: "YYYY-MM-DD[Thh:mm:ss]" -> "YYYY\tMM\tDD".
    # Default to the current year, day/month 1, when the date is absent,
    # empty, or malformed (the original raised IndexError on dates with
    # fewer than three '-'-separated parts).
    pdate = str(date.today().year) + '\t1\t1'
    if data.get('date', '') != '':
        parts = data['date'].split('-')
        if len(parts) == 3:
            parts[2] = parts[2].split('T')[0]
            pdate = '\t'.join(parts)

    # BUG FIX: the original sanitized 'source' with a duplicated, no-op
    # second .replace('\n', ''); the shared chain now lives in _sanitize.
    title = _sanitize(title)
    source = _sanitize(source)
    abstract = _sanitize(abstract)

    return (str(title) + "\t" + source + "\t" + str(pdate) + "\t"
            + abstract + "\t" + authors + "\t" + str(1) + "\n")


def _sanitize(text):
    """Drop non-UTF-8 bytes and strip characters that would break the
    TSV output (tabs, double quotes, newlines)."""
    return (text.encode(encoding='UTF-8', errors='ignore')
                .decode('utf-8')
                .replace('\t', '')
                .replace('"', '')
                .replace('\n', ''))
def makeTSV(items):
    """Write a GarganText TSV for the given Zotero items to 'output.tsv'.

    Only web-like items (webpage, encyclopediaArticle, blogPost) are
    converted; attachments and other item types are merely printed.
    """
    web_types = ['webpage', 'encyclopediaArticle', 'blogPost']
    txt = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
    for item in items:
        item_type = item['data']['itemType']
        if item_type in web_types:
            txt += getDataFromWebPage(item)
        elif item_type == 'attachment':
            # Attachments are not converted yet; just show them.
            print(item)
        else:
            print("??")
    with open('output.tsv', 'w') as f:
        f.write(txt)
# Interactive entry point: fetch Zotero items (by search or by collection)
# and convert them with makeTSV.
print("Id:")
user_id = input()
zot = zotero.Zotero(user_id, 'user')

print("Items (i)/ Collection (c)")
mode = input()
if mode == 'i':
    # Free-text search over the user's top-level items.
    print('Search :')
    search = input()
    zot.add_parameters(q=search)
    items = zot.top()
else:
    # List the collections, then let the user pick one or more by name
    # (space-separated).
    docs = zot.collections()
    name_to_key = {}
    print('Collection :')
    for doc in docs:
        name_to_key[doc['data']['name']] = doc['data']['key']
        print(doc['data']['name'])
    print("choose collection")
    col = input()
    items = []
    for elem in col.split(' '):
        items += zot.collection_items(name_to_key[elem])

txt = makeTSV(items)
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/isidoreToTSV/IsidoreAPIToGarganText.py
0 → 100644
View file @
5635c954
"""Query the Isidore API and export the results as a GarganText corpus.

Usage : python3 IsidoreAPIToGarganText.py search nb_replies language
e.g.  : python3 IsidoreAPIToGarganText.py "brain muscle" 100 fra
"""

import json
import sys
from datetime import date

import requests as req


def _pick_translation(value, language):
    """Resolve an Isidore multilingual field to one plain string.

    Isidore returns either a plain str, a dict ``{'$': text, ...}``, or a
    list mixing both forms; prefer the entry whose '@xml:lang' matches the
    requested language, falling back to the first entry.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        chosen = ''
        for lang in value:
            if not isinstance(lang, str) and lang['@xml:lang'] == language[:2]:
                chosen = lang['$']
        if chosen == '':
            return value[0] if isinstance(value[0], str) else value[0]['$']
        return chosen
    return value['$']


def _clean(text, drop_newlines=False):
    """Remove bytes/characters that would corrupt the TSV output."""
    text = (text.encode(encoding='UTF-8', errors='ignore')
                .decode('utf-8')
                .replace('\t', '')
                .replace('"', ''))
    if drop_newlines:
        text = text.replace('\n', '')
    return text


try:
    search = sys.argv[1]
    replies = sys.argv[2]
    language = sys.argv[3]
except IndexError:
    print("! args error\n")
    sys.exit(0)

# BUG FIX: sys.argv entries are strings; the original compared str > int,
# which raises TypeError on Python 3.
if int(replies) > 1000:
    print("The number of replies must be less than 1000")
    sys.exit(0)

url = ('https://api.isidore.science/resource/search?q=' + search
       + '&output=json&replies=' + replies
       + '&language=http://lexvo.org/id/iso639-3/' + language)
resp = req.get(url)
jsontxt = json.loads(resp.content)
docs = jsontxt["response"]["replies"]["content"]["reply"]

# Output file (kept named output.csv as before, although the content is
# tab-separated — the README advertises output.tsv; TODO reconcile).
with open("output.csv", "w") as output:
    header = ("title\tsource\tpublication_year\tpublication_month"
              "\tpublication_day\tabstract\tauthors\tweight\n")
    output.write(header)
    for doc in docs:
        # Title (multilingual field).
        title = _pick_translation(doc["isidore"]["title"], language)
        # Source
        source = doc["isidore"]["source_info"]["sourceName"]["$"]
        # Authors, ';'-separated; 'creator' may be a single dict or a list.
        if doc['isidore']['enrichedCreators'] != []:
            list_author = doc["isidore"]["enrichedCreators"]["creator"]
            if isinstance(list_author, list):
                authors = ';'.join(a["@origin"] for a in list_author)
            else:
                authors = list_author["@origin"]
        else:
            authors = ''
        # Abstract (multilingual field, may be absent or empty).
        if 'abstract' in doc['isidore'].keys() and doc["isidore"]["abstract"] != []:
            abstract = _pick_translation(doc["isidore"]["abstract"], language)
        else:
            abstract = ''
        # Publication date "YYYY-MM-DD" -> "YYYY\tMM\tDD"; default to the
        # current year when the field is absent or malformed.
        try:
            pdate = '\t'.join(doc["isidore"]["date"]["normalizedDate"].split('-'))
        except Exception:
            pdate = str(date.today().year) + '\t01\t01'
        abstract = _clean(abstract)
        title = _clean(title)
        source = _clean(source, drop_newlines=True)
        row = (str(title) + "\t" + source + "\t" + pdate + "\t" + abstract
               + "\t" + authors + "\t" + str(1) + "\n")
        output.write(row)
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/isidoreToTSV/README.md
0 → 100644
View file @
5635c954
# IsidoreAPIToGargantext
## About The project
IsidoreAPIToGargantext calls the Isidore API to run a search with the given parameters and creates a TSV file usable in GarganText.
## Usage
```
shell
python3 IsidoreAPIToGargantext.py search replies lang
```
search is what you want to search for in Isidore
replies is the number of replies taken from Isidore's answer
lang is the language (see note)
Output a TSV legacy corpus named output.tsv
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
## Note
language | lang | work?
| :--- |:--- |:---
French | fra | fine
English | eng | fine
Deutch | deu | fine
Spanish | spa | fine
Italian | ita | fine
Portuguese | por | fine
Polish | nld | low answer
Russian | rus | low answer
Chinese | lzh | should work but doesn't currently
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/.streamlit/pages.toml
View file @
5635c954
...
...
@@ -79,3 +79,7 @@ is_section = true
path
=
"pages/Merge_Term_GarganText.py"
name
=
"Merge GarganText Terms"
[[pages]]
path
=
"pages/GEXF_To_TermOcc.py"
name
=
"GEXF To Term"
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_GEXFToTermOcc.csv
0 → 100644
View file @
5635c954
locale,key,value
fr,title,"# Term / Occurrence"
en,title,"# Json To TSV"
fr,text,"Transforme un fichier GEXF venant du graphe de GarganText en un fichier TSV de terme et d'occurrence."
en,text,"Transform a GEXF file of a graph from GarganText to a TSV file of term and occurrence."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_IsidoreToGarganText.csv
View file @
5635c954
...
...
@@ -2,7 +2,7 @@ locale,key,value
fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
t
en un fichier TSV."
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
r
en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés"
...
...
@@ -23,10 +23,10 @@ en,overload'api,"The API is overloaded, please retry the request in a few second
fr,nb_doc,"Nombre de documents : "
en,nb_doc,"Number of documents : "
fr,perform1,"Pour des raisons de perform
ence
, on limite à "
fr,perform2," le nombre
maximum de documents.
"
fr,perform1,"Pour des raisons de perform
ances
, on limite à "
fr,perform2," le nombre
de documents maximums
"
en,perform1,"For performance reasons, we limit to "
en,perform2,"
,the maximum number of documents.
"
en,perform2,"
the maximum number of documents
"
fr,nb_taken,"Nombre de documents à prendre"
en,nb_taken,"Number of documents to take into account"
...
...
@@ -35,6 +35,6 @@ fr,createTSV,"Création du fichier TSV (cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (it may take a while)"
fr,doc_abstract1,"Il y a "
fr,doc_abstract2," documents qui peuvent ne pas avoir de description."
fr,doc_abstract2," documents qui peuvent ne pas avoir de description
s
."
en,doc_abstract1,"There are "
en,doc_abstract2," documents who may not have an abstract"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_IstexToGarganText.csv
View file @
5635c954
...
...
@@ -11,7 +11,7 @@ en,file,"Choose a file"
fr,dup1,"Certains fichiers ("
fr,dup2,") ont été retirés pour diverses raisons (fichier au mauvais format, fichiers identiques au regard des données utilisées par GarganText...)"
en,dup1,"Some file ("
en,dup2,") have been removed for various reasons (
file with wrong format, file already presen
t...)"
en,dup2,") have been removed for various reasons (
especially indentic file, unusable forma
t...)"
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download the TSV file:"
...
...
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_PubMedToGarganText.csv
View file @
5635c954
...
...
@@ -11,5 +11,5 @@ en,file,"Choose a file"
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download le TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
fr,error,"Erreur : le fichier n'est pas valide !"
en,error,"Error : the file isn't valid !"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_YTBtoTSV.csv
View file @
5635c954
...
...
@@ -35,5 +35,5 @@ en,loading,"Videos processing : "
fr,quantity," sur "
en,quantity," out of "
fr,new_file,"Télécharge
ton
fichier TSV :"
fr,new_file,"Télécharge
z votre
fichier TSV :"
en,new_file,"Download your TSV file :"
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_ZoteroToGarganText.csv
View file @
5635c954
...
...
@@ -14,8 +14,8 @@ en,submit,"Submit"
fr,denied,"L'accès au compte n'est pas public, pour le rendre public: https://www.zotero.org/settings/privacy"
en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
fr,add_doc,"*Ajoute
r
les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that
t
ou want in the TSV*"
fr,add_doc,"*Ajoute
z
les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that
y
ou want in the TSV*"
fr,select_all,"Select All"
en,select_all,"Select All"
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/Clean_CSV_to_TSV.py
View file @
5635c954
...
...
@@ -39,54 +39,29 @@ def getSeparator(file):
return
'
\t
'
,
False
def
checkPublicationCase
(
tmp
,
split
,
success
):
if
split
:
if
tmp
[
0
][
0
]
.
isupper
()
or
tmp
[
1
][
0
]
.
isupper
():
return
False
else
:
return
success
if
not
tmp
[
0
][
0
]
.
isupper
()
or
not
tmp
[
1
][
0
]
.
isupper
():
return
False
return
success
def
checkPublication
(
name
,
registeredNames
,
errorMessage
):
tmpName
=
name
def
lowerName
(
name
):
tmp
=
name
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
tmpName
=
name
[
1
:]
tmp
=
tmpName
.
split
(
' '
)
success
=
True
tmp
=
name
[
1
:]
if
len
(
tmp
)
<
9
:
return
tmp
.
lower
()
tmp
=
name
.
split
(
' '
)
split
=
False
first
=
""
second
=
""
if
"_"
in
tmp
[
0
]
and
len
(
tmp
)
==
1
:
if
len
(
tmp
)
==
1
and
"_"
in
tmp
[
0
]
:
tmp
=
tmp
[
0
]
.
split
(
'_'
)
split
=
True
if
len
(
tmp
)
!=
2
:
success
=
False
return
name
.
lower
()
else
:
success
=
checkPublicationCase
(
tmp
,
split
,
success
)
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
if
first
!=
"publication"
or
second
not
in
[
"day"
,
"month"
,
"year"
]:
success
=
False
if
not
success
:
errorMessage
+=
"Error at line 1 ! Wrong name : "
+
\
name
+
" is not appropriated !
\n
"
else
:
registeredNames
.
append
(
first
+
"_"
+
second
)
return
success
,
errorMessage
return
first
+
"_"
+
second
def
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
):
tmpName
=
name
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
tmpName
=
name
[
1
:]
if
tmpName
not
in
columnNames
:
errorMessage
+=
"Error at line 1 ! Wrong name : "
+
\
name
+
" is not appropriated !
\n
"
return
False
,
errorMessage
if
tmpName
in
registeredNames
:
if
name
in
registeredNames
:
errorMessage
+=
"Error at line 1 ! Same name for 2 differents columns!
\n
"
return
False
,
errorMessage
return
True
,
errorMessage
...
...
@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage):
return
True
,
errorMessage
def
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
success
):
columnNames
=
[
"authors"
,
"title"
,
"publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
,
"source"
]
def
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
otherColumns
,
success
):
columnNames
=
[
"authors"
,
"title"
,
"
source"
,
"
publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
]
name
=
name
.
replace
(
"
\n
"
,
""
)
if
len
(
name
)
>
9
:
tmpSuccess
,
errorMessage
=
checkPublication
(
name
,
registeredNames
,
errorMessage
)
else
:
name
=
name
.
replace
(
" "
,
""
)
tmpSuccess
,
errorMessage
=
checkNameValidity
(
name
[
0
]
.
lower
()
+
name
[
1
:],
columnNames
,
registeredNames
,
errorMessage
)
if
tmpSuccess
:
registeredNames
.
append
(
name
[
0
]
.
lower
()
+
name
[
1
:])
if
success
:
tmpSuccess
,
errorMessage
=
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
)
if
tmpSuccess
:
if
lowerName
(
name
)
in
columnNames
:
registeredNames
.
append
(
name
)
else
:
otherColumns
.
append
(
name
)
if
success
:
success
=
tmpSuccess
return
success
,
errorMessage
,
registeredNames
return
errorMessage
,
registeredNames
,
otherColumns
,
success
def
addColumnsNamestoTSV
(
data
,
registeredNames
,
otherColumns
):
for
name
in
registeredNames
:
if
data
!=
""
:
data
+=
"
\t
"
data
+=
name
for
name
in
otherColumns
:
data
+=
"
\t
"
data
+=
name
return
data
def
getColumnsNames
(
file
,
separator
,
errorMessage
):
data
=
""
...
...
@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage):
success
=
True
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
columnsName
s
=
[]
othersColumn
s
=
[]
for
row
in
reader
:
for
name
,
value
in
row
.
items
():
columnName
=
name
.
replace
(
"
\ufeff
"
,
""
)
if
(
columnNb
<
7
):
success
,
errorMessage
,
registeredNames
=
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
success
)
if
data
!=
""
:
data
+=
"
\t
"
data
+=
columnName
columnNb
+=
1
errorMessage
,
registeredNames
,
otherColumns
,
success
=
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
othersColumns
,
success
)
success
,
errorMessage
=
checkColumnExistence
(
registeredNames
,
errorMessage
)
if
success
:
data
=
addColumnsNamestoTSV
(
data
,
registeredNames
,
otherColumns
)
break
data
+=
"
\n
"
return
data
,
success
,
errorMessage
def
lowerName
(
name
):
tmp
=
name
.
split
(
' '
)
split
=
False
first
=
""
second
=
""
if
len
(
tmp
)
==
1
and
"_"
in
tmp
[
0
]:
tmp
=
tmp
[
0
]
.
split
(
'_'
)
split
=
True
if
len
(
tmp
)
!=
2
:
return
name
.
lower
()
else
:
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
return
first
+
"_"
+
second
def
checkDate
(
name
,
value
,
success
,
fill
,
csvLine
,
errorMessage
):
if
name
in
[
"publication_year"
,
"publication_month"
,
"publication_day"
]:
if
value
==
""
or
value
==
"
\n
"
:
...
...
@@ -210,43 +173,45 @@ def correctedSequence(text):
tmp
=
"
\"
"
+
tmp
+
"
\"
"
return
tmp
def
getContent
(
file
,
separator
,
data
,
success
,
fill
,
errorMessage
):
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
columnNames
=
[
"authors"
,
"title"
,
"source"
,
"publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
]
csvLine
=
2
columnNb
=
0
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
for
row
in
reader
:
tmp
=
""
first
=
True
tsv1
=
""
tsv2
=
""
for
name
,
value
in
row
.
items
():
tmpFill
=
""
if
not
first
:
tmp
+=
"
\t
"
else
:
first
=
False
if
(
columnNb
<
7
):
if
lowerName
(
name
)
in
columnNames
:
if
not
first
:
tsv1
+=
"
\t
"
success
,
tmpFill
,
errorMessage
=
checkMissing
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
if
tmpFill
!=
""
:
t
mp
+=
tmpFill
t
sv1
+=
tmpFill
else
:
success
,
tmpFill
,
errorMessage
=
checkDate
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
tmp
+=
correctedSequence
(
value
)
else
:
tmp
+=
correctedSequence
(
value
)
columnNb
+=
1
columnNb
=
0
tsv1
+=
correctedSequence
(
value
)
else
:
success
,
tmpFill
,
errorMessage
=
checkMissing
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
if
tmpFill
!=
""
:
tsv2
+=
"
\t
"
+
tmpFill
else
:
tsv2
+=
"
\t
"
+
correctedSequence
(
value
)
if
first
:
first
=
False
csvLine
+=
1
data
+=
t
mp
+
"
\n
"
data
+=
t
sv1
+
tsv2
+
"
\n
"
return
data
[:
-
1
],
success
,
errorMessage
# Code End
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
session_state
.
fill
=
st
.
checkbox
(
st
.
session_state
.
general_text_dict
[
'fill'
])
st
.
session_state
.
fill
=
st
.
checkbox
(
value
=
True
,
label
=
st
.
session_state
.
general_text_dict
[
'fill'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/GEXF_To_TermOcc.py
0 → 100644
View file @
5635c954
"""Streamlit page: convert a GarganText GEXF graph export into a
term/occurrence TSV (one line per node, sorted by decreasing size)."""

import streamlit as st
import networkx as nx
import src.basic as basic

basic.base('GEXFToTermOcc')


def create_file(file):
    """Return 'mapTerm\\tocc' TSV text built from an uploaded GEXF file.

    GarganText exports GEXF 1.3, which networkx cannot parse, so the XML
    header is first rewritten to GEXF 1.2draft in place.
    """
    # BUG FIX: the original chain also "replaced" the xmlns:xsi
    # declaration with an identical string; that no-op has been dropped.
    downgraded = (
        file.getvalue().decode('utf-8')
        .replace('version="1.3"', 'version="1.2"')
        .replace('xmlns="http://www.gexf.net/1.3"',
                 'xmlns="http://www.gexf.net/1.2draft"')
        .replace('xmlns:viz="http://gexf.net/1.3/viz"',
                 'xmlns:viz="http://www.gexf.net/1.2draft/viz"')
        .replace('xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd"',
                 'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd"')
        .encode())
    # Overwrite the upload buffer so networkx re-reads the rewritten XML.
    # Every replacement is same-length or longer, so no stale tail bytes
    # remain after the write.
    file.seek(0, 0)
    file.write(downgraded)
    file.seek(0, 0)
    graph = nx.read_gexf(file, version='1.2draft')

    # (label, size) pairs, biggest nodes first.
    terms = [(attrs['label'], attrs['viz']['size'])
             for _, attrs in graph.nodes(True)]
    terms.sort(key=lambda term: term[1], reverse=True)

    res = 'mapTerm\tocc\n'
    for label, size in terms:
        res += label + '\t' + str(int(size)) + '\n'
    return res


st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(st.session_state.general_text_dict['file'],
                        type=["gexf"], key='file')
if file:
    try:
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button('Download TSV', create_file(file), 'output.csv')
    except Exception:
        st.write(st.session_state.general_text_dict['error'])
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/pages/HAL_To_GarganText.py
View file @
5635c954
...
...
@@ -171,10 +171,10 @@ if st.session_state.stage_isidore > 0:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
limitItems
,
10
,
10
)
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
)
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
int
(
st
.
session_state
.
nb_doc
),
10
,
10
)
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/Isidore_To_GarganText.py
View file @
5635c954
...
...
@@ -19,7 +19,7 @@ numberReplies = 500 # Dont' exceed 1 000
limitItems
=
5000
# Can't be superior of 10 times numberReplies
retryTime
=
2
## Connect to Isidore API to get the numbers of docs from the research
def
loadApiIsidoreNumberFile
(
search
,
language
):
while
(
True
):
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
\
...
...
@@ -39,7 +39,7 @@ def loadApiIsidoreNumberFile(search, language):
return
docs
## Connect to Isidore API to get the documents from the pages
def
loadApiIsidorePage
(
search
,
language
,
page
):
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
'&output=json&replies='
+
\
str
(
numberReplies
)
+
'&page='
+
str
(
page
)
+
\
...
...
@@ -58,6 +58,7 @@ def loadApiIsidorePage(search, language, page):
def
create_output
(
search
,
language
,
nb_doc
):
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
nb
=
0
## nb is used to return ther number of file with
for
i
in
range
(
1
,
nb_doc
//
numberReplies
+
1
):
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
i
)
...
...
@@ -68,6 +69,8 @@ def create_output(search, language, nb_doc):
tmp
,
nb_tmp
=
createFile
(
txt
,
numberReplies
,
language
)
output
+=
tmp
nb
+=
nb_tmp
## If their is still some document do find (for exampe with 1160 documents, their is still 160 documents to find after the first part)
if
nb_doc
%
numberReplies
!=
0
:
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
nb_doc
//
numberReplies
+
1
)
...
...
@@ -151,7 +154,6 @@ def createFile(docs, limit, language):
abstract
=
''
if
'types'
in
doc
[
'isidore'
]
.
keys
():
print
(
i
)
if
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
str
and
doc
[
'isidore'
][
'types'
][
'type'
]
in
[
'Books'
,
'text'
]:
nb
+=
1
elif
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
dict
and
doc
[
'isidore'
][
'types'
][
'type'
][
'$'
]
in
[
'Books'
,
'text'
]:
...
...
@@ -249,7 +251,7 @@ form.form_submit_button(
# API and Slider
if
st
.
session_state
.
stage_isidore
>
0
:
# Only call first time and after
# Only call first time and after
an update in the first form
if
'search'
not
in
st
.
session_state
or
'language'
not
in
st
.
session_state
or
search
!=
st
.
session_state
.
search
or
language
!=
st
.
session_state
.
language
:
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'load_api'
]):
nb_doc
=
int
(
loadApiIsidoreNumberFile
(
search
,
lang
[
language
]))
...
...
@@ -269,10 +271,10 @@ if st.session_state.stage_isidore > 0:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
limitItems
,
10
,
10
)
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
)
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
int
(
st
.
session_state
.
nb_doc
),
10
,
10
)
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/Istex_To_GarganText.py
View file @
5635c954
...
...
@@ -5,7 +5,7 @@ Loïc Chapron
import
json
import
pandas
as
pd
import
datetime
from
datetime
import
datetime
import
zipfile
import
streamlit
as
st
import
src.basic
as
tmp
...
...
@@ -60,8 +60,6 @@ def read_zip(zip_file):
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
except
:
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
article
.
get
(
"publicationDate"
,
datetime
.
date
.
today
()
.
year
)[
0
]
temp
[
"publication_month"
]
=
1
temp
[
"publication_day"
]
=
1
...
...
@@ -78,7 +76,7 @@ def read_zip(zip_file):
if
(
duplicated
.
any
()):
dup
+=
duplicated
.
sum
()
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
[
~
duplicated
]
df
=
pd
.
DataFrame
(
output
)
return
df
.
to_csv
(
index
=
False
,
sep
=
'
\t
'
),
dup
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/PDF_to_TSV.py
View file @
5635c954
...
...
@@ -13,6 +13,8 @@ import re
import
chardet
import
pandas
as
pd
import
streamlit
as
st
import
lib.tika.tika
as
tika
tika
.
initVM
()
from
lib.tika.tika
import
parser
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
...
...
@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
count
=
1
languages
=
{}
while
n
<
nbLines
-
2
:
doc
=
"
\n
"
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
doc
=
"
\n
"
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
.
replace
(
""
,
""
)
title
=
source
+
" : Part "
+
str
(
count
)
tsv
+=
correctedSequence
(
author
,
False
)
+
"
\t
"
+
correctedSequence
(
source
,
False
)
+
"
\t
"
+
year
+
"
\t
"
+
month
+
"
\t
"
+
day
+
"
\t
"
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment