Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
GarganTexternal tools
Commits
5635c954
Commit
5635c954
authored
1 year ago
by
Anne-Laure Thomas Derepas
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into 'master'
Dev See merge request
athomas/gargantexternal-tools!13
parents
41e00418
22967fb8
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
494 additions
and
116 deletions
+494
-116
Istex2ggtx.py
Conversion/ToTSV/IsTexToGarganText/Istex2ggtx.py
+88
-0
README.md
Conversion/ToTSV/IsTexToGarganText/README.md
+15
-0
istex-subset-2023-07-17.zip
...oTSV/IsTexToGarganText/sample/istex-subset-2023-07-17.zip
+0
-0
istex-subset-2023-07-19.zip
...oTSV/IsTexToGarganText/sample/istex-subset-2023-07-19.zip
+0
-0
README.md
Conversion/ToTSV/ZoteroToGarganText/README.md
+11
-0
ZoteroToGarganText.py
Conversion/ToTSV/ZoteroToGarganText/ZoteroToGarganText.py
+95
-0
IsidoreAPIToGarganText.py
Conversion/ToTSV/isidoreToTSV/IsidoreAPIToGarganText.py
+102
-0
README.md
Conversion/ToTSV/isidoreToTSV/README.md
+38
-0
pages.toml
Streamlit/.streamlit/pages.toml
+4
-0
text_GEXFToTermOcc.csv
Streamlit/lang/text_GEXFToTermOcc.csv
+15
-0
text_IsidoreToGarganText.csv
Streamlit/lang/text_IsidoreToGarganText.csv
+5
-5
text_IstexToGarganText.csv
Streamlit/lang/text_IstexToGarganText.csv
+1
-1
text_PubMedToGarganText.csv
Streamlit/lang/text_PubMedToGarganText.csv
+2
-2
text_YTBtoTSV.csv
Streamlit/lang/text_YTBtoTSV.csv
+1
-1
text_ZoteroToGarganText.csv
Streamlit/lang/text_ZoteroToGarganText.csv
+2
-2
Clean_CSV_to_TSV.py
Streamlit/pages/Clean_CSV_to_TSV.py
+57
-92
GEXF_To_TermOcc.py
Streamlit/pages/GEXF_To_TermOcc.py
+43
-0
HAL_To_GarganText.py
Streamlit/pages/HAL_To_GarganText.py
+2
-2
Isidore_To_GarganText.py
Streamlit/pages/Isidore_To_GarganText.py
+8
-6
Istex_To_GarganText.py
Streamlit/pages/Istex_To_GarganText.py
+2
-4
PDF_to_TSV.py
Streamlit/pages/PDF_to_TSV.py
+3
-1
No files found.
Conversion/ToTSV/IsTexToGarganText/Istex2ggtx.py
0 → 100644
View file @
5635c954
#!/usr/bin/env python
# coding: utf-8
"""Convert a zip archive of Istex JSON records into a GarganText TSV corpus.

Usage: python3 Istex2ggtx.py input.zip output.tsv
"""

import datetime
import json
import sys
import zipfile

import pandas as pd


def _clean(text):
    """Drop non-UTF-8 bytes and replace tabs (the TSV separator) by spaces."""
    return (text.encode(encoding='UTF-8', errors='ignore')
                .decode('utf-8')
                .replace('\t', ' '))


input_file = sys.argv[1]
output_file = sys.argv[2]

output = []
with zipfile.ZipFile(input_file, 'r') as zip_ref:
    for file in zip_ref.namelist():
        # Keep only JSON records; skip the archive's manifest file.
        # (The original indexed split('.')[1], which crashed on any
        # entry name that contains no dot.)
        if not file.endswith('.json') or file.split('.')[0] == 'manifest':
            continue
        try:
            article = json.load(zip_ref.open(file))
            temp = {}
            temp["title"] = _clean(article.get("title", ""))
            temp["abstract"] = _clean(article.get("abstract", ""))
            authors = ", ".join(author["name"]
                                for author in article.get("author", []))
            # 'code' (the Istex id) is only used to report duplicates
            # below; it is dropped before the TSV is written.
            temp["code"] = article.get("_id")
            # ',' -> ';' because GarganText separates authors with ';'.
            temp["authors"] = _clean(authors).replace(",", ";")
            temp["source"] = _clean(article["host"]["title"])
            temp["publication_year"] = article.get("publicationDate",
                                                   datetime.date.today().year)
            temp["publication_month"] = 1
            temp["publication_day"] = 1
            output.append(temp)
        except Exception as e:
            # Best-effort: report the bad record and keep converting.
            print(file, e)

output = pd.DataFrame(output)

# Flag records whose title (lower-cased, commas removed) already appeared.
duplicated = output['title'].str.lower().replace(",", "", regex=True).duplicated()
if duplicated.any():
    print("\nQuelques fichiers n'ont pas été introduits dans le TSV car ils pourraient apparaitre plusieurs fois:")
    # BUG FIX: the original looped up to size - 1 and never reported the
    # last record.
    for i in range(output["title"].size):
        if duplicated[i]:
            print("\t" + str(output["code"][i]) + " " + output["title"][i])

# BUG FIX: DataFrame.drop returns a new frame; the original discarded the
# result, so the internal 'code' column leaked into the exported TSV.
output = output.drop(['code'], axis=1)
output = output[~duplicated]

output.to_csv(output_file, sep='\t', index=False)
print("")
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/IsTexToGarganText/README.md
0 → 100644
View file @
5635c954
# IstexToGargantext
## About The project
IstexToGargantext converts a zip file from Istex into a TSV file for GarganText.
## Usage
```
shell
python3 Istex2ggtx.py file.zip output.tsv
```
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/IsTexToGarganText/sample/istex-subset-2023-07-17.zip
0 → 100644
View file @
5635c954
File added
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/IsTexToGarganText/sample/istex-subset-2023-07-19.zip
0 → 100644
View file @
5635c954
File added
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/ZoteroToGarganText/README.md
0 → 100644
View file @
5635c954
# ZoteroToGargantext
## About The project
ZoteroToGarganText isn't usable right now; it needs modifications to transform txt and pdf files from Zotero into a TSV for GarganText.
## Usage
```
shell
python3 ZoteroToGarganText.py
```
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/ZoteroToGarganText/ZoteroToGarganText.py
0 → 100644
View file @
5635c954
from
pyzotero
import
zotero
from
datetime
import
date
def getDataFromWebPage(item):
    """Build one GarganText TSV row from a Zotero item.

    Parameters
    ----------
    item : dict
        A Zotero API item; only ``item['data']`` is read (``title``,
        ``url``, and optionally ``creators``, ``abstractNote``, ``date``).

    Returns
    -------
    str
        One tab-separated line ending with a newline:
        title, source (url), year, month, day, abstract, authors, weight.
    """
    data = item['data']
    title = data['title']
    source = data['url']
    abstract = data.get('abstractNote', '')

    # Authors: creators' last names joined with ';'
    # (';' is GarganText's author separator).
    if 'creators' in data:
        authors = ';'.join(author['lastName'] for author in data['creators'])
    else:
        authors = ''

    # Publication date: "YYYY-MM-DD[Thh:mm:ss]" -> "YYYY\tMM\tDD".
    # Default to the current year, day/month 1, when the date is absent,
    # empty, or malformed (the original raised IndexError on dates with
    # fewer than three '-'-separated parts).
    pdate = str(date.today().year) + '\t1\t1'
    if data.get('date', '') != '':
        parts = data['date'].split('-')
        if len(parts) == 3:
            parts[2] = parts[2].split('T')[0]
            pdate = '\t'.join(parts)

    # BUG FIX: the original sanitized 'source' with a duplicated, no-op
    # second .replace('\n', ''); the shared chain now lives in _sanitize.
    title = _sanitize(title)
    source = _sanitize(source)
    abstract = _sanitize(abstract)

    return (str(title) + "\t" + source + "\t" + str(pdate) + "\t"
            + abstract + "\t" + authors + "\t" + str(1) + "\n")


def _sanitize(text):
    """Drop non-UTF-8 bytes and strip characters that would break the
    TSV output (tabs, double quotes, newlines)."""
    return (text.encode(encoding='UTF-8', errors='ignore')
                .decode('utf-8')
                .replace('\t', '')
                .replace('"', '')
                .replace('\n', ''))
def makeTSV(items):
    """Write a GarganText TSV for the given Zotero items to 'output.tsv'.

    Only web-like items (webpage, encyclopediaArticle, blogPost) are
    converted; attachments and other item types are merely printed.
    """
    web_types = ['webpage', 'encyclopediaArticle', 'blogPost']
    txt = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
    for item in items:
        item_type = item['data']['itemType']
        if item_type in web_types:
            txt += getDataFromWebPage(item)
        elif item_type == 'attachment':
            # Attachments are not converted yet; just show them.
            print(item)
        else:
            print("??")
    with open('output.tsv', 'w') as f:
        f.write(txt)
# Interactive entry point: fetch Zotero items (by search or by collection)
# and convert them with makeTSV.
print("Id:")
user_id = input()
zot = zotero.Zotero(user_id, 'user')

print("Items (i)/ Collection (c)")
mode = input()
if mode == 'i':
    # Free-text search over the user's top-level items.
    print('Search :')
    search = input()
    zot.add_parameters(q=search)
    items = zot.top()
else:
    # List the collections, then let the user pick one or more by name
    # (space-separated).
    docs = zot.collections()
    name_to_key = {}
    print('Collection :')
    for doc in docs:
        name_to_key[doc['data']['name']] = doc['data']['key']
        print(doc['data']['name'])
    print("choose collection")
    col = input()
    items = []
    for elem in col.split(' '):
        items += zot.collection_items(name_to_key[elem])

txt = makeTSV(items)
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/isidoreToTSV/IsidoreAPIToGarganText.py
0 → 100644
View file @
5635c954
"""Query the Isidore API and export the results as a GarganText corpus.

Usage : python3 IsidoreAPIToGarganText.py search nb_replies language
e.g.  : python3 IsidoreAPIToGarganText.py "brain muscle" 100 fra
"""

import json
import sys
from datetime import date

import requests as req


def _pick_translation(value, language):
    """Resolve an Isidore multilingual field to one plain string.

    Isidore returns either a plain str, a dict ``{'$': text, ...}``, or a
    list mixing both forms; prefer the entry whose '@xml:lang' matches the
    requested language, falling back to the first entry.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        chosen = ''
        for lang in value:
            if not isinstance(lang, str) and lang['@xml:lang'] == language[:2]:
                chosen = lang['$']
        if chosen == '':
            return value[0] if isinstance(value[0], str) else value[0]['$']
        return chosen
    return value['$']


def _clean(text, drop_newlines=False):
    """Remove bytes/characters that would corrupt the TSV output."""
    text = (text.encode(encoding='UTF-8', errors='ignore')
                .decode('utf-8')
                .replace('\t', '')
                .replace('"', ''))
    if drop_newlines:
        text = text.replace('\n', '')
    return text


try:
    search = sys.argv[1]
    replies = sys.argv[2]
    language = sys.argv[3]
except IndexError:
    print("! args error\n")
    sys.exit(0)

# BUG FIX: sys.argv entries are strings; the original compared str > int,
# which raises TypeError on Python 3.
if int(replies) > 1000:
    print("The number of replies must be less than 1000")
    sys.exit(0)

url = ('https://api.isidore.science/resource/search?q=' + search
       + '&output=json&replies=' + replies
       + '&language=http://lexvo.org/id/iso639-3/' + language)
resp = req.get(url)
jsontxt = json.loads(resp.content)
docs = jsontxt["response"]["replies"]["content"]["reply"]

# Output file (kept named output.csv as before, although the content is
# tab-separated — the README advertises output.tsv; TODO reconcile).
with open("output.csv", "w") as output:
    header = ("title\tsource\tpublication_year\tpublication_month"
              "\tpublication_day\tabstract\tauthors\tweight\n")
    output.write(header)
    for doc in docs:
        # Title (multilingual field).
        title = _pick_translation(doc["isidore"]["title"], language)
        # Source
        source = doc["isidore"]["source_info"]["sourceName"]["$"]
        # Authors, ';'-separated; 'creator' may be a single dict or a list.
        if doc['isidore']['enrichedCreators'] != []:
            list_author = doc["isidore"]["enrichedCreators"]["creator"]
            if isinstance(list_author, list):
                authors = ';'.join(a["@origin"] for a in list_author)
            else:
                authors = list_author["@origin"]
        else:
            authors = ''
        # Abstract (multilingual field, may be absent or empty).
        if 'abstract' in doc['isidore'].keys() and doc["isidore"]["abstract"] != []:
            abstract = _pick_translation(doc["isidore"]["abstract"], language)
        else:
            abstract = ''
        # Publication date "YYYY-MM-DD" -> "YYYY\tMM\tDD"; default to the
        # current year when the field is absent or malformed.
        try:
            pdate = '\t'.join(doc["isidore"]["date"]["normalizedDate"].split('-'))
        except Exception:
            pdate = str(date.today().year) + '\t01\t01'
        abstract = _clean(abstract)
        title = _clean(title)
        source = _clean(source, drop_newlines=True)
        row = (str(title) + "\t" + source + "\t" + pdate + "\t" + abstract
               + "\t" + authors + "\t" + str(1) + "\n")
        output.write(row)
This diff is collapsed.
Click to expand it.
Conversion/ToTSV/isidoreToTSV/README.md
0 → 100644
View file @
5635c954
# IsidoreAPIToGargantext
## About The project
IsidoreAPIToGargantext calls the Isidore API to run a search with the given parameters and creates a TSV file usable in GarganText.
## Usage
```
shell
python3 IsidoreAPIToGargantext.py search replies lang
```
search is what you want to search for in Isidore
replies is the number of replies taken from Isidore's answer
lang is the language (see note)
Output a TSV legacy corpus named output.tsv
## Date
This script was last updated on 2023/07/24.
It may be outdated in the future.
## Note
language | lang | work?
| :--- |:--- |:---
French | fra | fine
English | eng | fine
Deutch | deu | fine
Spanish | spa | fine
Italian | ita | fine
Portuguese | por | fine
Polish | nld | low answer
Russian | rus | low answer
Chinese | lzh | should work but doesn't currently
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/.streamlit/pages.toml
View file @
5635c954
...
...
@@ -79,3 +79,7 @@ is_section = true
path
=
"pages/Merge_Term_GarganText.py"
name
=
"Merge GarganText Terms"
[[pages]]
path
=
"pages/GEXF_To_TermOcc.py"
name
=
"GEXF To Term"
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_GEXFToTermOcc.csv
0 → 100644
View file @
5635c954
locale,key,value
fr,title,"# Term / Occurrence"
en,title,"# Json To TSV"
fr,text,"Transforme un fichier GEXF venant du graphe de GarganText en un fichier TSV de terme et d'occurrence."
en,text,"Transform a GEXF file of a graph from GarganText to a TSV file of term and occurrence."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_IsidoreToGarganText.csv
View file @
5635c954
...
...
@@ -2,7 +2,7 @@ locale,key,value
fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
t
en un fichier TSV."
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
r
en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés"
...
...
@@ -23,10 +23,10 @@ en,overload'api,"The API is overloaded, please retry the request in a few second
fr,nb_doc,"Nombre de documents : "
en,nb_doc,"Number of documents : "
fr,perform1,"Pour des raisons de perform
ence
, on limite à "
fr,perform2," le nombre
maximum de documents.
"
fr,perform1,"Pour des raisons de perform
ances
, on limite à "
fr,perform2," le nombre
de documents maximums
"
en,perform1,"For performance reasons, we limit to "
en,perform2,"
,the maximum number of documents.
"
en,perform2,"
the maximum number of documents
"
fr,nb_taken,"Nombre de documents à prendre"
en,nb_taken,"Number of documents to take into account"
...
...
@@ -35,6 +35,6 @@ fr,createTSV,"Création du fichier TSV (cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (it may take a while)"
fr,doc_abstract1,"Il y a "
fr,doc_abstract2," documents qui peuvent ne pas avoir de description."
fr,doc_abstract2," documents qui peuvent ne pas avoir de description
s
."
en,doc_abstract1,"There are "
en,doc_abstract2," documents who may not have an abstract"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_IstexToGarganText.csv
View file @
5635c954
...
...
@@ -11,7 +11,7 @@ en,file,"Choose a file"
fr,dup1,"Certains fichiers ("
fr,dup2,") ont été retirés pour diverses raisons (fichier au mauvais format, fichiers identiques au regard des données utilisées par GarganText...)"
en,dup1,"Some file ("
en,dup2,") have been removed for various reasons (
file with wrong format, file already presen
t...)"
en,dup2,") have been removed for various reasons (
especially indentic file, unusable forma
t...)"
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download the TSV file:"
...
...
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_PubMedToGarganText.csv
View file @
5635c954
...
...
@@ -11,5 +11,5 @@ en,file,"Choose a file"
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download le TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
fr,error,"Erreur : le fichier n'est pas valide !"
en,error,"Error : the file isn't valid !"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_YTBtoTSV.csv
View file @
5635c954
...
...
@@ -35,5 +35,5 @@ en,loading,"Videos processing : "
fr,quantity," sur "
en,quantity," out of "
fr,new_file,"Télécharge
ton
fichier TSV :"
fr,new_file,"Télécharge
z votre
fichier TSV :"
en,new_file,"Download your TSV file :"
This diff is collapsed.
Click to expand it.
Streamlit/lang/text_ZoteroToGarganText.csv
View file @
5635c954
...
...
@@ -14,8 +14,8 @@ en,submit,"Submit"
fr,denied,"L'accès au compte n'est pas public, pour le rendre public: https://www.zotero.org/settings/privacy"
en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
fr,add_doc,"*Ajoute
r
les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that
t
ou want in the TSV*"
fr,add_doc,"*Ajoute
z
les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that
y
ou want in the TSV*"
fr,select_all,"Select All"
en,select_all,"Select All"
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/Clean_CSV_to_TSV.py
View file @
5635c954
...
...
@@ -39,54 +39,29 @@ def getSeparator(file):
return
'
\t
'
,
False
def
checkPublicationCase
(
tmp
,
split
,
success
):
if
split
:
if
tmp
[
0
][
0
]
.
isupper
()
or
tmp
[
1
][
0
]
.
isupper
():
return
False
else
:
return
success
if
not
tmp
[
0
][
0
]
.
isupper
()
or
not
tmp
[
1
][
0
]
.
isupper
():
return
False
return
success
def
checkPublication
(
name
,
registeredNames
,
errorMessage
):
tmpName
=
name
def
lowerName
(
name
):
tmp
=
name
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
tmpName
=
name
[
1
:]
tmp
=
tmpName
.
split
(
' '
)
success
=
True
tmp
=
name
[
1
:]
if
len
(
tmp
)
<
9
:
return
tmp
.
lower
()
tmp
=
name
.
split
(
' '
)
split
=
False
first
=
""
second
=
""
if
"_"
in
tmp
[
0
]
and
len
(
tmp
)
==
1
:
if
len
(
tmp
)
==
1
and
"_"
in
tmp
[
0
]
:
tmp
=
tmp
[
0
]
.
split
(
'_'
)
split
=
True
if
len
(
tmp
)
!=
2
:
success
=
False
return
name
.
lower
()
else
:
success
=
checkPublicationCase
(
tmp
,
split
,
success
)
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
if
first
!=
"publication"
or
second
not
in
[
"day"
,
"month"
,
"year"
]:
success
=
False
if
not
success
:
errorMessage
+=
"Error at line 1 ! Wrong name : "
+
\
name
+
" is not appropriated !
\n
"
else
:
registeredNames
.
append
(
first
+
"_"
+
second
)
return
success
,
errorMessage
return
first
+
"_"
+
second
def
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
):
tmpName
=
name
if
re
.
search
(
'[a-zA-Z0-9]'
,
name
[
0
])
==
None
:
tmpName
=
name
[
1
:]
if
tmpName
not
in
columnNames
:
errorMessage
+=
"Error at line 1 ! Wrong name : "
+
\
name
+
" is not appropriated !
\n
"
return
False
,
errorMessage
if
tmpName
in
registeredNames
:
if
name
in
registeredNames
:
errorMessage
+=
"Error at line 1 ! Same name for 2 differents columns!
\n
"
return
False
,
errorMessage
return
True
,
errorMessage
...
...
@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage):
return
True
,
errorMessage
def
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
success
):
columnNames
=
[
"authors"
,
"title"
,
"publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
,
"source"
]
def
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
otherColumns
,
success
):
columnNames
=
[
"authors"
,
"title"
,
"
source"
,
"
publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
]
name
=
name
.
replace
(
"
\n
"
,
""
)
if
len
(
name
)
>
9
:
tmpSuccess
,
errorMessage
=
checkPublication
(
name
,
registeredNames
,
errorMessage
)
else
:
name
=
name
.
replace
(
" "
,
""
)
tmpSuccess
,
errorMessage
=
checkNameValidity
(
name
[
0
]
.
lower
()
+
name
[
1
:],
columnNames
,
registeredNames
,
errorMessage
)
if
tmpSuccess
:
registeredNames
.
append
(
name
[
0
]
.
lower
()
+
name
[
1
:])
if
success
:
tmpSuccess
,
errorMessage
=
checkNameValidity
(
name
,
columnNames
,
registeredNames
,
errorMessage
)
if
tmpSuccess
:
if
lowerName
(
name
)
in
columnNames
:
registeredNames
.
append
(
name
)
else
:
otherColumns
.
append
(
name
)
if
success
:
success
=
tmpSuccess
return
success
,
errorMessage
,
registeredNames
return
errorMessage
,
registeredNames
,
otherColumns
,
success
def
addColumnsNamestoTSV
(
data
,
registeredNames
,
otherColumns
):
for
name
in
registeredNames
:
if
data
!=
""
:
data
+=
"
\t
"
data
+=
name
for
name
in
otherColumns
:
data
+=
"
\t
"
data
+=
name
return
data
def
getColumnsNames
(
file
,
separator
,
errorMessage
):
data
=
""
...
...
@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage):
success
=
True
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
columnsName
s
=
[]
othersColumn
s
=
[]
for
row
in
reader
:
for
name
,
value
in
row
.
items
():
columnName
=
name
.
replace
(
"
\ufeff
"
,
""
)
if
(
columnNb
<
7
):
success
,
errorMessage
,
registeredNames
=
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
success
)
if
data
!=
""
:
data
+=
"
\t
"
data
+=
columnName
columnNb
+=
1
errorMessage
,
registeredNames
,
otherColumns
,
success
=
checkColumnNames
(
name
,
errorMessage
,
registeredNames
,
othersColumns
,
success
)
success
,
errorMessage
=
checkColumnExistence
(
registeredNames
,
errorMessage
)
if
success
:
data
=
addColumnsNamestoTSV
(
data
,
registeredNames
,
otherColumns
)
break
data
+=
"
\n
"
return
data
,
success
,
errorMessage
def
lowerName
(
name
):
tmp
=
name
.
split
(
' '
)
split
=
False
first
=
""
second
=
""
if
len
(
tmp
)
==
1
and
"_"
in
tmp
[
0
]:
tmp
=
tmp
[
0
]
.
split
(
'_'
)
split
=
True
if
len
(
tmp
)
!=
2
:
return
name
.
lower
()
else
:
first
=
tmp
[
0
][
0
]
.
lower
()
+
tmp
[
0
][
1
:]
second
=
tmp
[
1
][
0
]
.
lower
()
+
tmp
[
1
][
1
:]
return
first
+
"_"
+
second
def
checkDate
(
name
,
value
,
success
,
fill
,
csvLine
,
errorMessage
):
if
name
in
[
"publication_year"
,
"publication_month"
,
"publication_day"
]:
if
value
==
""
or
value
==
"
\n
"
:
...
...
@@ -210,43 +173,45 @@ def correctedSequence(text):
tmp
=
"
\"
"
+
tmp
+
"
\"
"
return
tmp
def
getContent
(
file
,
separator
,
data
,
success
,
fill
,
errorMessage
):
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
columnNames
=
[
"authors"
,
"title"
,
"source"
,
"publication_year"
,
"publication_month"
,
"publication_day"
,
"abstract"
]
csvLine
=
2
columnNb
=
0
reader
=
csv
.
DictReader
(
codecs
.
iterdecode
(
file
,
'utf-8'
),
delimiter
=
separator
)
for
row
in
reader
:
tmp
=
""
first
=
True
tsv1
=
""
tsv2
=
""
for
name
,
value
in
row
.
items
():
tmpFill
=
""
if
not
first
:
tmp
+=
"
\t
"
else
:
first
=
False
if
(
columnNb
<
7
):
if
lowerName
(
name
)
in
columnNames
:
if
not
first
:
tsv1
+=
"
\t
"
success
,
tmpFill
,
errorMessage
=
checkMissing
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
if
tmpFill
!=
""
:
t
mp
+=
tmpFill
t
sv1
+=
tmpFill
else
:
success
,
tmpFill
,
errorMessage
=
checkDate
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
tmp
+=
correctedSequence
(
value
)
else
:
tmp
+=
correctedSequence
(
value
)
columnNb
+=
1
columnNb
=
0
tsv1
+=
correctedSequence
(
value
)
else
:
success
,
tmpFill
,
errorMessage
=
checkMissing
(
lowerName
(
name
),
value
,
success
,
fill
,
csvLine
,
errorMessage
)
if
tmpFill
!=
""
:
tsv2
+=
"
\t
"
+
tmpFill
else
:
tsv2
+=
"
\t
"
+
correctedSequence
(
value
)
if
first
:
first
=
False
csvLine
+=
1
data
+=
t
mp
+
"
\n
"
data
+=
t
sv1
+
tsv2
+
"
\n
"
return
data
[:
-
1
],
success
,
errorMessage
# Code End
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
session_state
.
fill
=
st
.
checkbox
(
st
.
session_state
.
general_text_dict
[
'fill'
])
st
.
session_state
.
fill
=
st
.
checkbox
(
value
=
True
,
label
=
st
.
session_state
.
general_text_dict
[
'fill'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/GEXF_To_TermOcc.py
0 → 100644
View file @
5635c954
"""Streamlit page: convert a GarganText GEXF graph export into a
term/occurrence TSV (one line per node, sorted by decreasing size)."""

import streamlit as st
import networkx as nx
import src.basic as basic

basic.base('GEXFToTermOcc')


def create_file(file):
    """Return 'mapTerm\\tocc' TSV text built from an uploaded GEXF file.

    GarganText exports GEXF 1.3, which networkx cannot parse, so the XML
    header is first rewritten to GEXF 1.2draft in place.
    """
    # BUG FIX: the original chain also "replaced" the xmlns:xsi
    # declaration with an identical string; that no-op has been dropped.
    downgraded = (
        file.getvalue().decode('utf-8')
        .replace('version="1.3"', 'version="1.2"')
        .replace('xmlns="http://www.gexf.net/1.3"',
                 'xmlns="http://www.gexf.net/1.2draft"')
        .replace('xmlns:viz="http://gexf.net/1.3/viz"',
                 'xmlns:viz="http://www.gexf.net/1.2draft/viz"')
        .replace('xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd"',
                 'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd"')
        .encode())
    # Overwrite the upload buffer so networkx re-reads the rewritten XML.
    # Every replacement is same-length or longer, so no stale tail bytes
    # remain after the write.
    file.seek(0, 0)
    file.write(downgraded)
    file.seek(0, 0)
    graph = nx.read_gexf(file, version='1.2draft')

    # (label, size) pairs, biggest nodes first.
    terms = [(attrs['label'], attrs['viz']['size'])
             for _, attrs in graph.nodes(True)]
    terms.sort(key=lambda term: term[1], reverse=True)

    res = 'mapTerm\tocc\n'
    for label, size in terms:
        res += label + '\t' + str(int(size)) + '\n'
    return res


st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(st.session_state.general_text_dict['file'],
                        type=["gexf"], key='file')
if file:
    try:
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button('Download TSV', create_file(file), 'output.csv')
    except Exception:
        st.write(st.session_state.general_text_dict['error'])
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Streamlit/pages/HAL_To_GarganText.py
View file @
5635c954
...
...
@@ -171,10 +171,10 @@ if st.session_state.stage_isidore > 0:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
limitItems
,
10
,
10
)
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
)
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
int
(
st
.
session_state
.
nb_doc
),
10
,
10
)
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/Isidore_To_GarganText.py
View file @
5635c954
...
...
@@ -19,7 +19,7 @@ numberReplies = 500 # Dont' exceed 1 000
limitItems
=
5000
# Can't be superior of 10 times numberReplies
retryTime
=
2
## Connect to Isidore API to get the numbers of docs from the research
def
loadApiIsidoreNumberFile
(
search
,
language
):
while
(
True
):
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
\
...
...
@@ -39,7 +39,7 @@ def loadApiIsidoreNumberFile(search, language):
return
docs
## Connect to Isidore API to get the documents from the pages
def
loadApiIsidorePage
(
search
,
language
,
page
):
url
=
'https://api.isidore.science/resource/search?q='
+
search
+
'&output=json&replies='
+
\
str
(
numberReplies
)
+
'&page='
+
str
(
page
)
+
\
...
...
@@ -58,6 +58,7 @@ def loadApiIsidorePage(search, language, page):
def
create_output
(
search
,
language
,
nb_doc
):
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
nb
=
0
## nb is used to return ther number of file with
for
i
in
range
(
1
,
nb_doc
//
numberReplies
+
1
):
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
i
)
...
...
@@ -68,6 +69,8 @@ def create_output(search, language, nb_doc):
tmp
,
nb_tmp
=
createFile
(
txt
,
numberReplies
,
language
)
output
+=
tmp
nb
+=
nb_tmp
## If their is still some document do find (for exampe with 1160 documents, their is still 160 documents to find after the first part)
if
nb_doc
%
numberReplies
!=
0
:
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
nb_doc
//
numberReplies
+
1
)
...
...
@@ -151,7 +154,6 @@ def createFile(docs, limit, language):
abstract
=
''
if
'types'
in
doc
[
'isidore'
]
.
keys
():
print
(
i
)
if
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
str
and
doc
[
'isidore'
][
'types'
][
'type'
]
in
[
'Books'
,
'text'
]:
nb
+=
1
elif
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
dict
and
doc
[
'isidore'
][
'types'
][
'type'
][
'$'
]
in
[
'Books'
,
'text'
]:
...
...
@@ -249,7 +251,7 @@ form.form_submit_button(
# API and Slider
if
st
.
session_state
.
stage_isidore
>
0
:
# Only call first time and after
# Only call first time and after
an update in the first form
if
'search'
not
in
st
.
session_state
or
'language'
not
in
st
.
session_state
or
search
!=
st
.
session_state
.
search
or
language
!=
st
.
session_state
.
language
:
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'load_api'
]):
nb_doc
=
int
(
loadApiIsidoreNumberFile
(
search
,
lang
[
language
]))
...
...
@@ -269,10 +271,10 @@ if st.session_state.stage_isidore > 0:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
limitItems
,
10
,
10
)
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
)
)
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
0
,
int
(
st
.
session_state
.
nb_doc
),
10
,
10
)
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/Istex_To_GarganText.py
View file @
5635c954
...
...
@@ -5,7 +5,7 @@ Loïc Chapron
import
json
import
pandas
as
pd
import
datetime
from
datetime
import
datetime
import
zipfile
import
streamlit
as
st
import
src.basic
as
tmp
...
...
@@ -60,8 +60,6 @@ def read_zip(zip_file):
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
except
:
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
article
.
get
(
"publicationDate"
,
datetime
.
date
.
today
()
.
year
)[
0
]
temp
[
"publication_month"
]
=
1
temp
[
"publication_day"
]
=
1
...
...
@@ -78,7 +76,7 @@ def read_zip(zip_file):
if
(
duplicated
.
any
()):
dup
+=
duplicated
.
sum
()
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
[
~
duplicated
]
df
=
pd
.
DataFrame
(
output
)
return
df
.
to_csv
(
index
=
False
,
sep
=
'
\t
'
),
dup
...
...
This diff is collapsed.
Click to expand it.
Streamlit/pages/PDF_to_TSV.py
View file @
5635c954
...
...
@@ -13,6 +13,8 @@ import re
import
chardet
import
pandas
as
pd
import
streamlit
as
st
import
lib.tika.tika
as
tika
tika
.
initVM
()
from
lib.tika.tika
import
parser
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
...
...
@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
count
=
1
languages
=
{}
while
n
<
nbLines
-
2
:
doc
=
"
\n
"
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
doc
=
"
\n
"
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
.
replace
(
""
,
""
)
title
=
source
+
" : Part "
+
str
(
count
)
tsv
+=
correctedSequence
(
author
,
False
)
+
"
\t
"
+
correctedSequence
(
source
,
False
)
+
"
\t
"
+
year
+
"
\t
"
+
month
+
"
\t
"
+
day
+
"
\t
"
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment