Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
GarganTexternal tools
Commits
5635c954
Commit
5635c954
authored
Sep 27, 2023
by
Anne-Laure Thomas Derepas
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into 'master'
Dev See merge request
athomas/gargantexternal-tools!13
parents
41e00418
22967fb8
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
494 additions
and
116 deletions
+494
-116
Istex2ggtx.py
Conversion/ToTSV/IsTexToGarganText/Istex2ggtx.py
+88
-0
README.md
Conversion/ToTSV/IsTexToGarganText/README.md
+15
-0
istex-subset-2023-07-17.zip
...oTSV/IsTexToGarganText/sample/istex-subset-2023-07-17.zip
+0
-0
istex-subset-2023-07-19.zip
...oTSV/IsTexToGarganText/sample/istex-subset-2023-07-19.zip
+0
-0
README.md
Conversion/ToTSV/ZoteroToGarganText/README.md
+11
-0
ZoteroToGarganText.py
Conversion/ToTSV/ZoteroToGarganText/ZoteroToGarganText.py
+95
-0
IsidoreAPIToGarganText.py
Conversion/ToTSV/isidoreToTSV/IsidoreAPIToGarganText.py
+102
-0
README.md
Conversion/ToTSV/isidoreToTSV/README.md
+38
-0
pages.toml
Streamlit/.streamlit/pages.toml
+4
-0
text_GEXFToTermOcc.csv
Streamlit/lang/text_GEXFToTermOcc.csv
+15
-0
text_IsidoreToGarganText.csv
Streamlit/lang/text_IsidoreToGarganText.csv
+5
-5
text_IstexToGarganText.csv
Streamlit/lang/text_IstexToGarganText.csv
+1
-1
text_PubMedToGarganText.csv
Streamlit/lang/text_PubMedToGarganText.csv
+2
-2
text_YTBtoTSV.csv
Streamlit/lang/text_YTBtoTSV.csv
+1
-1
text_ZoteroToGarganText.csv
Streamlit/lang/text_ZoteroToGarganText.csv
+2
-2
Clean_CSV_to_TSV.py
Streamlit/pages/Clean_CSV_to_TSV.py
+57
-92
GEXF_To_TermOcc.py
Streamlit/pages/GEXF_To_TermOcc.py
+43
-0
HAL_To_GarganText.py
Streamlit/pages/HAL_To_GarganText.py
+2
-2
Isidore_To_GarganText.py
Streamlit/pages/Isidore_To_GarganText.py
+8
-6
Istex_To_GarganText.py
Streamlit/pages/Istex_To_GarganText.py
+2
-4
PDF_to_TSV.py
Streamlit/pages/PDF_to_TSV.py
+3
-1
No files found.
Conversion/ToTSV/IsTexToGarganText/Istex2ggtx.py
0 → 100644
View file @
5635c954
#!/usr/bin/env python
# coding: utf-8
"""Convert a zip archive of Istex JSON records into a GarganText TSV corpus.

Usage: python Istex2ggtx.py input.zip output.tsv
"""
import json
import pandas as pd
import numpy as np
import glob
import sys
import datetime
import zipfile

input_file = sys.argv[1]
output_file = sys.argv[2]


def _clean(text):
    """Return *text* with undecodable bytes dropped and tabs turned to spaces.

    Tab characters would break the TSV structure, hence the replacement.
    """
    return (text.encode(encoding='UTF-8', errors='ignore')
                .decode("utf-8")
                .replace("\t", " "))


output = []
with zipfile.ZipFile(input_file, 'r') as zip_ref:
    for file in zip_ref.namelist():
        # Keep only article records: *.json entries that are not the manifest.
        # (The original indexed split('.')[1] outside the try block and crashed
        # on any entry name without a dot.)
        basename = file.rsplit('/', 1)[-1]
        if not basename.endswith('.json') or basename.split('.')[0] == 'manifest':
            continue
        try:
            article = json.load(zip_ref.open(file))
            temp = {}
            temp["title"] = _clean(article.get("title", ""))
            temp["abstract"] = _clean(article.get("abstract", ""))
            authors = ", ".join(author["name"]
                                for author in article.get("author", []))
            temp["code"] = article.get("_id")  # Istex id, used to report duplicates
            # GarganText uses ';' as the author separator, so commas are rewritten.
            temp["authors"] = _clean(authors).replace(",", ";")
            temp["source"] = _clean(article["host"]["title"])
            # publicationDate is kept as-is; fall back to the current year.
            temp["publication_year"] = article.get("publicationDate",
                                                   datetime.date.today().year)
            temp["publication_month"] = 1
            temp["publication_day"] = 1
            output.append(temp)
        except Exception as e:
            # Best-effort conversion: report the broken record and keep going.
            print(file, e)

output = pd.DataFrame(output)

if output.empty:
    # Nothing usable in the archive: still emit a valid (empty) TSV instead of
    # crashing on the missing 'title' column.
    output.to_csv(output_file, sep='\t', index=False)
    print("")
    sys.exit(0)

# Titles are compared case-insensitively and without commas to spot duplicates.
duplicated = output['title'].str.lower().replace(",", "", regex=True).duplicated()
if duplicated.any():
    print("\nQuelques fichiers n'ont pas été introduits dans le TSV car ils pourraient apparaitre plusieurs fois:")
    # BUG FIX: the original iterated range(0, size - 1) and silently skipped
    # the last row of the report.
    for i in range(output["title"].size):
        if duplicated[i]:
            print("\t" + output["code"][i] + " " + output["title"][i])

# BUG FIX: DataFrame.drop is not in-place — the original discarded its result,
# so the internal 'code' column leaked into the generated TSV.
output = output.drop(columns=['code'])
output = output[~duplicated]

output.to_csv(output_file, sep='\t', index=False)
print("")
Conversion/ToTSV/IsTexToGarganText/README.md
0 → 100644
View file @
5635c954
# IstexToGargantext
## About The project
IstexToGargantext convert a zip file from Istex into a TSV file for GarganText
## Usage
```
shell
python3 Istex2ggtx.py input.zip output.tsv
```
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
Conversion/ToTSV/IsTexToGarganText/sample/istex-subset-2023-07-17.zip
0 → 100644
View file @
5635c954
File added
Conversion/ToTSV/IsTexToGarganText/sample/istex-subset-2023-07-19.zip
0 → 100644
View file @
5635c954
File added
Conversion/ToTSV/ZoteroToGarganText/README.md
0 → 100644
View file @
5635c954
# ZoteroToGargantext
## About The project
ZoteroToGarganText isn't usable right now; it needs modification to transform txt and pdf files from Zotero into a TSV for GarganText
## Usage
```
shell
python3 ZoteroToGarganText.py
```
Conversion/ToTSV/ZoteroToGarganText/ZoteroToGarganText.py
0 → 100644
View file @
5635c954
from
pyzotero
import
zotero
from
datetime
import
date
def getDataFromWebPage(item):
    """Build one GarganText TSV row from a Zotero web-like item.

    Returns a newline-terminated, tab-separated line with the columns:
    title, source (url), year, month, day, abstract, authors, weight.
    """
    data = item['data']
    # Title
    title = data['title']
    # Authors: Zotero creators joined with ';' (GarganText separator).
    # Organisations have a 'name' field instead of 'lastName'.
    if 'creators' in data:
        authors = ';'.join(creator.get('lastName', creator.get('name', ''))
                           for creator in data['creators'])
    else:
        authors = ''
    # Source: the item URL (absent on some item types, hence .get).
    source = data.get('url', '')
    # Abstract
    abstract = data.get('abstractNote', '')
    # Date: "YYYY-MM-DDThh:mm" -> "YYYY\tMM\tDD".
    # BUG FIX: the original indexed pdate[2] unconditionally and crashed on
    # partial dates such as "2020" or "2020-05"; missing parts default to 1.
    if data.get('date', '') != '':
        pdate = data['date'].split('-')
        if len(pdate) > 2:
            pdate[2] = pdate[2].split('T')[0]
        while len(pdate) < 3:
            pdate.append('1')
        pdate = '\t'.join(pdate)
    else:
        pdate = str(date.today().year) + '\t1\t1'

    def clean(text):
        # Drop undecodable bytes and the characters that would corrupt the TSV.
        return (text.encode(encoding='UTF-8', errors='ignore')
                    .decode("utf-8")
                    .replace('\t', '').replace('"', '').replace('\n', ''))

    abstract = clean(abstract)
    title = clean(title)
    source = clean(source)
    # Output: constant weight of 1 in the last column.
    return (str(title) + "\t" + source + "\t" + str(pdate) + "\t" + abstract
            + "\t" + authors + "\t" + str(1) + "\n")
def makeTSV(items):
    """Write the given Zotero items to 'output.tsv' as a GarganText corpus.

    Web-like items are converted to TSV rows; attachments and unknown item
    types are only reported on stdout.
    """
    header = ("title\tsource\tpublication_year\tpublication_month"
              "\tpublication_day\tabstract\tauthors\tweight\n")
    rows = [header]
    web_types = ['webpage', 'encyclopediaArticle', 'blogPost']
    for entry in items:
        kind = entry['data']['itemType']
        if kind in web_types:
            rows.append(getDataFromWebPage(entry))
        elif kind == 'attachment':
            #with open('tmp/' + item['data']['title'], 'wb') as f:
            #    f.write(zot.file(item['data']['key']))
            print(entry)
        else:
            print("??")
            #print(item['data']['itemType'])
    with open('output.tsv', 'w') as out_stream:
        out_stream.write(''.join(rows))
# --- Interactive entry point -------------------------------------------------
# Asks for a Zotero user id, then either a free-text search over the top-level
# items ('i') or one or several collections ('c', names separated by spaces),
# and finally builds output.tsv from the selected items.
print("Id:")
id = input()
# NOTE(review): account access must be public for an unauthenticated 'user'
# session — confirm against the Zotero privacy settings mentioned in the UI.
zot = zotero.Zotero(id, 'user')
print("Items (i)/ Collection (c)")
t = input()
if t == 'i':
    print('Search :')
    search = input()
    zot.add_parameters(q=search)
    items = zot.top()
else:
    docs = zot.collections()
    # Map collection display name -> collection key for the user's choice.
    tmp = {}
    print('Collection :')
    for doc in docs:
        tmp[doc['data']['name']] = doc['data']['key']
        print(doc['data']['name'])
    print("choose collection")
    col = input()
    items = []
    # Several collection names may be given, separated by single spaces.
    for elem in col.split(' '):
        items += zot.collection_items(tmp[elem])
# NOTE(review): makeTSV writes output.tsv itself and returns None, so txt is
# always None here — verify whether the return value was meant to be used.
txt = makeTSV(items)
Conversion/ToTSV/isidoreToTSV/IsidoreAPIToGarganText.py
0 → 100644
View file @
5635c954
import requests as req
import json
import sys
from datetime import date

# Query the Isidore API and dump the replies as a GarganText TSV corpus.
# python3 IsidoreAPIToGarganText search nb_replies language
# ex : python3 IsidoreAPIToGarganText "brain muscle" 100 fra

try:
    search = sys.argv[1]
    replies = sys.argv[2]
    language = sys.argv[3]
except IndexError:
    print("! args error\n")
    sys.exit(0)

# BUG FIX: sys.argv values are strings; the original compared str > int,
# which raises TypeError on Python 3. Convert before comparing.
if int(replies) > 1000:
    print("The number of replier must be less than 1000")
    sys.exit(0)


def _pick_language(field, language):
    """Resolve a possibly multilingual Isidore field to a single string.

    *field* may be a plain string, a {'$': text, '@xml:lang': code} dict, or a
    list of those; the entry matching *language* is preferred, otherwise the
    first entry is used.
    """
    if type(field) == str:
        return field
    if type(field) == list:
        chosen = ''
        for lang in field:
            if type(lang) != str and lang['@xml:lang'] == language[:2]:
                chosen = lang['$']
        if chosen == '':
            if type(field[0]) == str:
                return field[0]
            return field[0]['$']
        return chosen
    return field['$']


def _clean(text):
    """Drop undecodable bytes plus the tab/quote characters that break TSV."""
    return (text.encode(encoding='UTF-8', errors='ignore')
                .decode("utf-8")
                .replace('\t', '').replace('"', ''))


url = ('https://api.isidore.science/resource/search?q=' + search
       + '&output=json&replies=' + replies
       + '&language=http://lexvo.org/id/iso639-3/' + language)
resp = req.get(url)
jsontxt = json.loads(resp.content)
docs = jsontxt["response"]["replies"]["content"]["reply"]

# Output file (tab-separated content despite the .csv name; the file is closed
# deterministically by the with-block — the original never closed it).
with open("output.csv", "w") as output:
    header = ("title\tsource\tpublication_year\tpublication_month"
              "\tpublication_day\tabstract\tauthors\tweight\n")
    output.write(header)
    for doc in docs:
        record = doc["isidore"]
        # Title (possibly multilingual)
        title = _pick_language(record["title"], language)
        # Source
        source = record["source_info"]["sourceName"]["$"]
        # Author(s): '@origin' values joined with ';'
        if record['enrichedCreators'] != []:
            list_author = record["enrichedCreators"]["creator"]
            if type(list_author) == list:
                authors = ';'.join(author["@origin"] for author in list_author)
            else:
                authors = list_author["@origin"]
        else:
            authors = ''
        # Abstract (possibly multilingual, possibly missing)
        if 'abstract' in record.keys() and record["abstract"] != []:
            abstract = _pick_language(record["abstract"], language)
        else:
            abstract = ''
        # Publication date: "YYYY-MM-DD" -> tab-separated; default to today.
        try:
            pdate = '\t'.join(record["date"]["normalizedDate"].split('-'))
        except Exception:
            pdate = str(date.today().year) + '\t01\t01'
        abstract = _clean(abstract)
        title = _clean(title)
        source = _clean(source).replace('\n', '')
        # Constant weight of 1 in the last column.
        row = (str(title) + "\t" + source + "\t" + pdate + "\t" + abstract
               + "\t" + authors + "\t" + str(1) + "\n")
        output.write(row)
Conversion/ToTSV/isidoreToTSV/README.md
0 → 100644
View file @
5635c954
# IsidoreAPIToGargantext
## About The project
IsidoreAPIToGargantext call isidore API to make a research using the parameter given and create a TSV file usable in GraganText
## Usage
```
shell
python3 IsidoreAPIToGargantext.py search replies lang
```
search is what you want to search for in Isidore
replies is the number of replies taken from the Isidore answer
lang is the language (see note)
Outputs a TSV-formatted corpus in a file named output.csv
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
## Note
language | lang | work?
| :--- |:--- |:---
French | fra | fine
English | eng | fine
Deutch | deu | fine
Spanish | spa | fine
Italian | ita | fine
Portuguese | por | fine
Polish | nld | low answer
Russian | rus | low answer
Chinese | lzh | should work but doesn't actually
\ No newline at end of file
Streamlit/.streamlit/pages.toml
View file @
5635c954
...
@@ -79,3 +79,7 @@ is_section = true
...
@@ -79,3 +79,7 @@ is_section = true
path
=
"pages/Merge_Term_GarganText.py"
path
=
"pages/Merge_Term_GarganText.py"
name
=
"Merge GarganText Terms"
name
=
"Merge GarganText Terms"
[[pages]]
path
=
"pages/GEXF_To_TermOcc.py"
name
=
"GEXF To Term"
Streamlit/lang/text_GEXFToTermOcc.csv
0 → 100644
View file @
5635c954
locale,key,value
fr,title,"# Term / Occurrence"
en,title,"# Term / Occurrence"
fr,text,"Transforme un fichier GEXF venant du graphe de GarganText en un fichier TSV de terme et d'occurrence."
en,text,"Transform a GEXF file of a graph from GarganText to a TSV file of term and occurrence."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
Streamlit/lang/text_IsidoreToGarganText.csv
View file @
5635c954
...
@@ -2,7 +2,7 @@ locale,key,value
...
@@ -2,7 +2,7 @@ locale,key,value
fr,title,"# Isidore vers GarganText"
fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText"
en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
t
en un fichier TSV."
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
r
en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés"
fr,keyword,"Mots clés"
...
@@ -23,10 +23,10 @@ en,overload'api,"The API is overloaded, please retry the request in a few second
...
@@ -23,10 +23,10 @@ en,overload'api,"The API is overloaded, please retry the request in a few second
fr,nb_doc,"Nombre de documents : "
fr,nb_doc,"Nombre de documents : "
en,nb_doc,"Number of documents : "
en,nb_doc,"Number of documents : "
fr,perform1,"Pour des raisons de perform
ence
, on limite à "
fr,perform1,"Pour des raisons de perform
ances
, on limite à "
fr,perform2," le nombre
maximum de documents.
"
fr,perform2," le nombre
de documents maximums
"
en,perform1,"For performance reasons, we limit to "
en,perform1,"For performance reasons, we limit to "
en,perform2,"
,the maximum number of documents.
"
en,perform2,"
the maximum number of documents
"
fr,nb_taken,"Nombre de documents à prendre"
fr,nb_taken,"Nombre de documents à prendre"
en,nb_taken,"Number of documents to take into account"
en,nb_taken,"Number of documents to take into account"
...
@@ -35,6 +35,6 @@ fr,createTSV,"Création du fichier TSV (cela peut prendre quelques minutes)"
...
@@ -35,6 +35,6 @@ fr,createTSV,"Création du fichier TSV (cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (it may take a while)"
en,createTSV,"Creation of the TSV file (it may take a while)"
fr,doc_abstract1,"Il y a "
fr,doc_abstract1,"Il y a "
fr,doc_abstract2," documents qui peuvent ne pas avoir de description."
fr,doc_abstract2," documents qui peuvent ne pas avoir de description
s
."
en,doc_abstract1,"There are "
en,doc_abstract1,"There are "
en,doc_abstract2," documents who may not have an abstract"
en,doc_abstract2," documents who may not have an abstract"
\ No newline at end of file
Streamlit/lang/text_IstexToGarganText.csv
View file @
5635c954
...
@@ -11,7 +11,7 @@ en,file,"Choose a file"
...
@@ -11,7 +11,7 @@ en,file,"Choose a file"
fr,dup1,"Certains fichiers ("
fr,dup1,"Certains fichiers ("
fr,dup2,") ont été retirés pour diverses raisons (fichier au mauvais format, fichiers identiques au regard des données utilisées par GarganText...)"
fr,dup2,") ont été retirés pour diverses raisons (fichier au mauvais format, fichiers identiques au regard des données utilisées par GarganText...)"
en,dup1,"Some file ("
en,dup1,"Some file ("
en,dup2,") have been removed for various reasons (
file with wrong format, file already presen
t...)"
en,dup2,") have been removed for various reasons (
especially indentic file, unusable forma
t...)"
fr,new_file,"Télécharger le fichier TSV :"
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download the TSV file:"
en,new_file,"Download the TSV file:"
...
...
Streamlit/lang/text_PubMedToGarganText.csv
View file @
5635c954
...
@@ -11,5 +11,5 @@ en,file,"Choose a file"
...
@@ -11,5 +11,5 @@ en,file,"Choose a file"
fr,new_file,"Télécharger le fichier TSV :"
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download le TSV file:"
en,new_file,"Download le TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
fr,error,"Erreur : le fichier n'est pas valide !"
en,error,"Error : the file isn't valid"
en,error,"Error : the file isn't valid !"
\ No newline at end of file
\ No newline at end of file
Streamlit/lang/text_YTBtoTSV.csv
View file @
5635c954
...
@@ -35,5 +35,5 @@ en,loading,"Videos processing : "
...
@@ -35,5 +35,5 @@ en,loading,"Videos processing : "
fr,quantity," sur "
fr,quantity," sur "
en,quantity," out of "
en,quantity," out of "
fr,new_file,"Télécharge
ton
fichier TSV :"
fr,new_file,"Télécharge
z votre
fichier TSV :"
en,new_file,"Download your TSV file :"
en,new_file,"Download your TSV file :"
Streamlit/lang/text_ZoteroToGarganText.csv
View file @
5635c954
...
@@ -14,8 +14,8 @@ en,submit,"Submit"
...
@@ -14,8 +14,8 @@ en,submit,"Submit"
fr,denied,"L'accès au compte n'est pas public, pour le rendre public: https://www.zotero.org/settings/privacy"
fr,denied,"L'accès au compte n'est pas public, pour le rendre public: https://www.zotero.org/settings/privacy"
en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
fr,add_doc,"*Ajoute
r
les documents que vous voulez mettre dans le TSV*"
fr,add_doc,"*Ajoute
z
les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that
t
ou want in the TSV*"
en,add_doc,"*Add the document that
y
ou want in the TSV*"
fr,select_all,"Select All"
fr,select_all,"Select All"
en,select_all,"Select All"
en,select_all,"Select All"
...
...
Streamlit/pages/Clean_CSV_to_TSV.py
View file @
5635c954
...
@@ -39,54 +39,29 @@ def getSeparator(file):
...
@@ -39,54 +39,29 @@ def getSeparator(file):
return
'
\t
'
,
False
return
'
\t
'
,
False
def
checkPublicationCase
(
tmp
,
split
,
success
):
def
lowerName
(
name
):
if
split
:
tmp
=
name
if
tmp
[
0
][
0
]
.
isupper
()
or
tmp
[
1
][
0
]
.
isupper
():
return
False
else
:
return
success
if
not
tmp
[
0
][
0
]
.
isupper
()
or
not
tmp
[
1
][
0
]
.
isupper
():
return
False
return
success
def
checkPublication
(
name
,
registeredNames
,
errorMessage
):
tmpName
=
name
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
tmpName
=
name
[
1
:]
tmp
=
name
[
1
:]
tmp
=
tmpName
.
split
(
' '
)
if
len
(
tmp
)
<
9
:
success
=
True
return
tmp
.
lower
()
tmp
=
name
.
split
(
' '
)
split
=
False
split
=
False
first
=
""
first
=
""
second
=
""
second
=
""
if
"_"
in
tmp
[
0
]
and
len
(
tmp
)
==
1
:
if
len
(
tmp
)
==
1
and
"_"
in
tmp
[
0
]
:
tmp
=
tmp
[
0
]
.
split
(
'_'
)
tmp
=
tmp
[
0
]
.
split
(
'_'
)
split
=
True
split
=
True
if
len
(
tmp
)
!=
2
:
if
len
(
tmp
)
!=
2
:
success
=
False
return
name
.
lower
()
else
:
else
:
success
=
checkPublicationCase
(
tmp
,
split
,
success
)
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
if
first
!=
"publication"
or
second
not
in
[
"day"
,
"month"
,
"year"
]:
return
first
+
"_"
+
second
success
=
False
if
not
success
:
errorMessage
+=
"Error at line 1 ! Wrong name : "
+
\
name
+
" is not appropriated !
\n
"
else
:
registeredNames
.
append
(
first
+
"_"
+
second
)
return
success
,
errorMessage
def
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
):
def
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
):
tmpName
=
name
if
name
in
registeredNames
:
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
tmpName
=
name
[
1
:]
if
tmpName
not
in
columnNames
:
errorMessage
+=
"Error at line 1 ! Wrong name : "
+
\
name
+
" is not appropriated !
\n
"
return
False
,
errorMessage
if
tmpName
in
registeredNames
:
errorMessage
+=
"Error at line 1 ! Same name for 2 differents columns!
\n
"
errorMessage
+=
"Error at line 1 ! Same name for 2 differents columns!
\n
"
return
False
,
errorMessage
return
False
,
errorMessage
return
True
,
errorMessage
return
True
,
errorMessage
...
@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage):
...
@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage):
return
True
,
errorMessage
return
True
,
errorMessage
def
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
success
):
def
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
otherColumns
,
success
):
columnNames
=
[
"authors"
,
"title"
,
"publication_year"
,
columnNames
=
[
"authors"
,
"title"
,
"
source"
,
"
publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
,
"source"
]
"publication_month"
,
"publication_day"
,
"abstract"
]
name
=
name
.
replace
(
"
\n
"
,
""
)
name
=
name
.
replace
(
"
\n
"
,
""
)
if
len
(
name
)
>
9
:
tmpSuccess
,
errorMessage
=
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
)
tmpSuccess
,
errorMessage
=
checkPublication
(
if
tmpSuccess
:
name
,
registeredNames
,
errorMessage
)
if
lowerName
(
name
)
in
columnNames
:
else
:
registeredNames
.
append
(
name
)
name
=
name
.
replace
(
" "
,
""
)
else
:
tmpSuccess
,
errorMessage
=
checkNameValidity
(
otherColumns
.
append
(
name
)
name
[
0
]
.
lower
()
+
name
[
1
:],
columnNames
,
registeredNames
,
errorMessage
)
if
success
:
if
tmpSuccess
:
registeredNames
.
append
(
name
[
0
]
.
lower
()
+
name
[
1
:])
if
success
:
success
=
tmpSuccess
success
=
tmpSuccess
return
success
,
errorMessage
,
registeredNames
return
errorMessage
,
registeredNames
,
otherColumns
,
success
def
addColumnsNamestoTSV
(
data
,
registeredNames
,
otherColumns
):
for
name
in
registeredNames
:
if
data
!=
""
:
data
+=
"
\t
"
data
+=
name
for
name
in
otherColumns
:
data
+=
"
\t
"
data
+=
name
return
data
def
getColumnsNames
(
file
,
separator
,
errorMessage
):
def
getColumnsNames
(
file
,
separator
,
errorMessage
):
data
=
""
data
=
""
...
@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage):
...
@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage):
success
=
True
success
=
True
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
file
,
'utf-8'
),
delimiter
=
separator
)
columnsName
s
=
[]
othersColumn
s
=
[]
for
row
in
reader
:
for
row
in
reader
:
for
name
,
value
in
row
.
items
():
for
name
,
value
in
row
.
items
():
columnName
=
name
.
replace
(
"
\ufeff
"
,
""
)
columnName
=
name
.
replace
(
"
\ufeff
"
,
""
)
if
(
columnNb
<
7
):
errorMessage
,
registeredNames
,
otherColumns
,
success
=
checkColumnNames
(
success
,
errorMessage
,
registeredNames
=
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
othersColumns
,
success
)
name
,
errorMessage
,
registeredNames
,
success
)
if
data
!=
""
:
data
+=
"
\t
"
data
+=
columnName
columnNb
+=
1
success
,
errorMessage
=
checkColumnExistence
(
success
,
errorMessage
=
checkColumnExistence
(
registeredNames
,
errorMessage
)
registeredNames
,
errorMessage
)
if
success
:
data
=
addColumnsNamestoTSV
(
data
,
registeredNames
,
otherColumns
)
break
break
data
+=
"
\n
"
data
+=
"
\n
"
return
data
,
success
,
errorMessage
return
data
,
success
,
errorMessage
def
lowerName
(
name
):
tmp
=
name
.
split
(
' '
)
split
=
False
first
=
""
second
=
""
if
len
(
tmp
)
==
1
and
"_"
in
tmp
[
0
]:
tmp
=
tmp
[
0
]
.
split
(
'_'
)
split
=
True
if
len
(
tmp
)
!=
2
:
return
name
.
lower
()
else
:
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
return
first
+
"_"
+
second
def
checkDate
(
name
,
value
,
success
,
fill
,
csvLine
,
errorMessage
):
def
checkDate
(
name
,
value
,
success
,
fill
,
csvLine
,
errorMessage
):
if
name
in
[
"publication_year"
,
"publication_month"
,
"publication_day"
]:
if
name
in
[
"publication_year"
,
"publication_month"
,
"publication_day"
]:
if
value
==
""
or
value
==
"
\n
"
:
if
value
==
""
or
value
==
"
\n
"
:
...
@@ -210,43 +173,45 @@ def correctedSequence(text):
...
@@ -210,43 +173,45 @@ def correctedSequence(text):
tmp
=
"
\"
"
+
tmp
+
"
\"
"
tmp
=
"
\"
"
+
tmp
+
"
\"
"
return
tmp
return
tmp
def
getContent
(
file
,
separator
,
data
,
success
,
fill
,
errorMessage
):
def
getContent
(
file
,
separator
,
data
,
success
,
fill
,
errorMessage
):
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
columnNames
=
[
"authors"
,
"title"
,
"source"
,
"publication_year"
,
file
,
'utf-8'
),
delimiter
=
separator
)
"publication_month"
,
"publication_day"
,
"abstract"
]
csvLine
=
2
csvLine
=
2
columnNb
=
0
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
for
row
in
reader
:
for
row
in
reader
:
tmp
=
""
first
=
True
first
=
True
tsv1
=
""
tsv2
=
""
for
name
,
value
in
row
.
items
():
for
name
,
value
in
row
.
items
():
tmpFill
=
""
tmpFill
=
""
if
not
first
:
if
lowerName
(
name
)
in
columnNames
:
tmp
+=
"
\t
"
if
not
first
:
else
:
tsv1
+=
"
\t
"
first
=
False
if
(
columnNb
<
7
):
success
,
tmpFill
,
errorMessage
=
checkMissing
(
success
,
tmpFill
,
errorMessage
=
checkMissing
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
if
tmpFill
!=
""
:
if
tmpFill
!=
""
:
t
mp
+=
tmpFill
t
sv1
+=
tmpFill
else
:
else
:
success
,
tmpFill
,
errorMessage
=
checkDate
(
success
,
tmpFill
,
errorMessage
=
checkDate
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
tmp
+=
correctedSequence
(
value
)
tsv1
+=
correctedSequence
(
value
)
else
:
else
:
tmp
+=
correctedSequence
(
value
)
success
,
tmpFill
,
errorMessage
=
checkMissing
(
columnNb
+=
1
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
columnNb
=
0
if
tmpFill
!=
""
:
tsv2
+=
"
\t
"
+
tmpFill
else
:
tsv2
+=
"
\t
"
+
correctedSequence
(
value
)
if
first
:
first
=
False
csvLine
+=
1
csvLine
+=
1
data
+=
t
mp
+
"
\n
"
data
+=
t
sv1
+
tsv2
+
"
\n
"
return
data
[:
-
1
],
success
,
errorMessage
return
data
[:
-
1
],
success
,
errorMessage
# Code End
# Code End
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
session_state
.
fill
=
st
.
checkbox
(
st
.
session_state
.
general_text_dict
[
'fill'
])
st
.
session_state
.
fill
=
st
.
checkbox
(
value
=
True
,
label
=
st
.
session_state
.
general_text_dict
[
'fill'
])
file
=
st
.
file_uploader
(
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
...
...
Streamlit/pages/GEXF_To_TermOcc.py
0 → 100644
View file @
5635c954
import
streamlit
as
st
import
networkx
as
nx
import
src.basic
as
tmp
tmp
.
base
(
'GEXFToTermOcc'
)
def create_file(file):
    """Convert an uploaded GarganText GEXF graph into a term/occurrence TSV.

    The GEXF 1.3 header is rewritten to 1.2draft so that networkx can parse
    it, then every node is emitted as "label<TAB>size", sorted by decreasing
    size.  Returns the TSV content as a string.
    """
    # Rewrite the GEXF 1.3 namespace declarations to the 1.2draft ones
    # understood by nx.read_gexf.
    # NOTE(review): the xmlns:xsi replacement below replaces the string with
    # itself (a no-op) — confirm whether a different target was intended.
    tmp = file.getvalue().decode('utf-8') \
        .replace('version="1.3"', 'version="1.2"') \
        .replace('xmlns="http://www.gexf.net/1.3"', 'xmlns="http://www.gexf.net/1.2draft"') \
        .replace('xmlns:viz="http://gexf.net/1.3/viz"', 'xmlns:viz="http://www.gexf.net/1.2draft/viz"') \
        .replace('xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"', 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"') \
        .replace('xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd"', 'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd"') \
        .encode()
    # Write the patched bytes back into the uploaded buffer and rewind it so
    # networkx reads the modified content.
    # NOTE(review): if the patched content were ever shorter than the original,
    # stale trailing bytes would remain in the buffer; the replacements above
    # only lengthen the text, so this is safe as written.
    file.seek(0, 0)
    file.write(tmp)
    file.seek(0, 0)
    tmp = nx.read_gexf(file, version='1.2draft')
    # Collect (label, size) for every node; size comes from the 'viz' extension.
    lst = []
    for elem in tmp.nodes(True):
        lst.append((elem[1]['label'], elem[1]['viz']['size']))
    # Largest terms first.
    lst.sort(key=lambda x: x[1], reverse=True)
    res = 'mapTerm\tocc\n'
    for elem in lst:
        res += elem[0] + '\t' + str(int(elem[1])) + '\n'
    return res
# Streamlit page body: show the localized description, let the user upload a
# .gexf graph file, and offer the resulting term/occurrence TSV as a download.
st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(st.session_state.general_text_dict['file'], type=["gexf"], key='file')
if file:
    try:
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button('Download TSV', create_file(file), 'output.csv')
    except Exception as e:
        # Any conversion failure is surfaced as the localized error message.
        st.write(st.session_state.general_text_dict['error'])
\ No newline at end of file
Streamlit/pages/HAL_To_GarganText.py
View file @
5635c954
...
@@ -171,10 +171,10 @@ if st.session_state.stage_isidore > 0:
...
@@ -171,10 +171,10 @@ if st.session_state.stage_isidore > 0:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
limitItems
,
10
,
10
)
else
:
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
)
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
int
(
st
.
session_state
.
nb_doc
),
10
,
10
)
form2
.
form_submit_button
(
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
...
...
Streamlit/pages/Isidore_To_GarganText.py
View file @
5635c954
...
@@ -19,7 +19,7 @@ numberReplies = 500 # Dont' exceed 1 000
...
@@ -19,7 +19,7 @@ numberReplies = 500 # Dont' exceed 1 000
limitItems
=
5000
# Can't be superior of 10 times numberReplies
limitItems
=
5000
# Can't be superior of 10 times numberReplies
retryTime
=
2
retryTime
=
2
## Connect to Isidore API to get the numbers of docs from the research
def
loadApiIsidoreNumberFile
(
search
,
language
):
def
loadApiIsidoreNumberFile
(
search
,
language
):
while
(
True
):
while
(
True
):
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
\
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
\
...
@@ -39,7 +39,7 @@ def loadApiIsidoreNumberFile(search, language):
...
@@ -39,7 +39,7 @@ def loadApiIsidoreNumberFile(search, language):
return
docs
return
docs
## Connect to Isidore API to get the documents from the pages
def
loadApiIsidorePage
(
search
,
language
,
page
):
def
loadApiIsidorePage
(
search
,
language
,
page
):
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
'&output=json&replies='
+
\
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
'&output=json&replies='
+
\
str
(
numberReplies
)
+
'&page='
+
str
(
page
)
+
\
str
(
numberReplies
)
+
'&page='
+
str
(
page
)
+
\
...
@@ -58,6 +58,7 @@ def loadApiIsidorePage(search, language, page):
...
@@ -58,6 +58,7 @@ def loadApiIsidorePage(search, language, page):
def
create_output
(
search
,
language
,
nb_doc
):
def
create_output
(
search
,
language
,
nb_doc
):
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
nb
=
0
nb
=
0
## nb is used to return ther number of file with
for
i
in
range
(
1
,
nb_doc
//
numberReplies
+
1
):
for
i
in
range
(
1
,
nb_doc
//
numberReplies
+
1
):
while
(
True
):
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
i
)
txt
=
loadApiIsidorePage
(
search
,
language
,
i
)
...
@@ -68,6 +69,8 @@ def create_output(search, language, nb_doc):
...
@@ -68,6 +69,8 @@ def create_output(search, language, nb_doc):
tmp
,
nb_tmp
=
createFile
(
txt
,
numberReplies
,
language
)
tmp
,
nb_tmp
=
createFile
(
txt
,
numberReplies
,
language
)
output
+=
tmp
output
+=
tmp
nb
+=
nb_tmp
nb
+=
nb_tmp
## If their is still some document do find (for exampe with 1160 documents, their is still 160 documents to find after the first part)
if
nb_doc
%
numberReplies
!=
0
:
if
nb_doc
%
numberReplies
!=
0
:
while
(
True
):
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
nb_doc
//
numberReplies
+
1
)
txt
=
loadApiIsidorePage
(
search
,
language
,
nb_doc
//
numberReplies
+
1
)
...
@@ -151,7 +154,6 @@ def createFile(docs, limit, language):
...
@@ -151,7 +154,6 @@ def createFile(docs, limit, language):
abstract
=
''
abstract
=
''
if
'types'
in
doc
[
'isidore'
]
.
keys
():
if
'types'
in
doc
[
'isidore'
]
.
keys
():
print
(
i
)
if
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
str
and
doc
[
'isidore'
][
'types'
][
'type'
]
in
[
'Books'
,
'text'
]:
if
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
str
and
doc
[
'isidore'
][
'types'
][
'type'
]
in
[
'Books'
,
'text'
]:
nb
+=
1
nb
+=
1
elif
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
dict
and
doc
[
'isidore'
][
'types'
][
'type'
][
'$'
]
in
[
'Books'
,
'text'
]:
elif
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
dict
and
doc
[
'isidore'
][
'types'
][
'type'
][
'$'
]
in
[
'Books'
,
'text'
]:
...
@@ -249,7 +251,7 @@ form.form_submit_button(
...
@@ -249,7 +251,7 @@ form.form_submit_button(
# API and Slider
# API and Slider
if
st
.
session_state
.
stage_isidore
>
0
:
if
st
.
session_state
.
stage_isidore
>
0
:
# Only call first time and after
# Only call first time and after
an update in the first form
if
'search'
not
in
st
.
session_state
or
'language'
not
in
st
.
session_state
or
search
!=
st
.
session_state
.
search
or
language
!=
st
.
session_state
.
language
:
if
'search'
not
in
st
.
session_state
or
'language'
not
in
st
.
session_state
or
search
!=
st
.
session_state
.
search
or
language
!=
st
.
session_state
.
language
:
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'load_api'
]):
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'load_api'
]):
nb_doc
=
int
(
loadApiIsidoreNumberFile
(
search
,
lang
[
language
]))
nb_doc
=
int
(
loadApiIsidoreNumberFile
(
search
,
lang
[
language
]))
...
@@ -269,10 +271,10 @@ if st.session_state.stage_isidore > 0:
...
@@ -269,10 +271,10 @@ if st.session_state.stage_isidore > 0:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
limitItems
,
10
,
10
)
else
:
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
)
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
int
(
st
.
session_state
.
nb_doc
),
10
,
10
)
form2
.
form_submit_button
(
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
...
...
Streamlit/pages/Istex_To_GarganText.py
View file @
5635c954
...
@@ -5,7 +5,7 @@ Loïc Chapron
...
@@ -5,7 +5,7 @@ Loïc Chapron
import
json
import
json
import
pandas
as
pd
import
pandas
as
pd
import
datetime
from
datetime
import
datetime
import
zipfile
import
zipfile
import
streamlit
as
st
import
streamlit
as
st
import
src.basic
as
tmp
import
src.basic
as
tmp
...
@@ -60,8 +60,6 @@ def read_zip(zip_file):
...
@@ -60,8 +60,6 @@ def read_zip(zip_file):
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
except
:
except
:
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
article
.
get
(
"publicationDate"
,
datetime
.
date
.
today
()
.
year
)[
0
]
temp
[
"publication_month"
]
=
1
temp
[
"publication_month"
]
=
1
temp
[
"publication_day"
]
=
1
temp
[
"publication_day"
]
=
1
...
@@ -78,7 +76,7 @@ def read_zip(zip_file):
...
@@ -78,7 +76,7 @@ def read_zip(zip_file):
if
(
duplicated
.
any
()):
if
(
duplicated
.
any
()):
dup
+=
duplicated
.
sum
()
dup
+=
duplicated
.
sum
()
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
[
~
duplicated
]
output
=
output
[
~
duplicated
]
df
=
pd
.
DataFrame
(
output
)
df
=
pd
.
DataFrame
(
output
)
return
df
.
to_csv
(
index
=
False
,
sep
=
'
\t
'
),
dup
return
df
.
to_csv
(
index
=
False
,
sep
=
'
\t
'
),
dup
...
...
Streamlit/pages/PDF_to_TSV.py
View file @
5635c954
...
@@ -13,6 +13,8 @@ import re
...
@@ -13,6 +13,8 @@ import re
import
chardet
import
chardet
import
pandas
as
pd
import
pandas
as
pd
import
streamlit
as
st
import
streamlit
as
st
import
lib.tika.tika
as
tika
tika
.
initVM
()
from
lib.tika.tika
import
parser
from
lib.tika.tika
import
parser
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
...
@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
...
@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
count
=
1
count
=
1
languages
=
{}
languages
=
{}
while
n
<
nbLines
-
2
:
while
n
<
nbLines
-
2
:
doc
=
"
\n
"
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
doc
=
"
\n
"
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
.
replace
(
""
,
""
)
title
=
source
+
" : Part "
+
str
(
count
)
title
=
source
+
" : Part "
+
str
(
count
)
tsv
+=
correctedSequence
(
author
,
False
)
+
"
\t
"
+
correctedSequence
(
tsv
+=
correctedSequence
(
author
,
False
)
+
"
\t
"
+
correctedSequence
(
source
,
False
)
+
"
\t
"
+
year
+
"
\t
"
+
month
+
"
\t
"
+
day
+
"
\t
"
source
,
False
)
+
"
\t
"
+
year
+
"
\t
"
+
month
+
"
\t
"
+
day
+
"
\t
"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment