Commit be50ef3c authored by Nicolas Atrax

Update PDF_to_TSV.py

parent 450d9bcf
...@@ -65,7 +65,7 @@ def estimateLanguagesPercentage(languages): ...@@ -65,7 +65,7 @@ def estimateLanguagesPercentage(languages):
for l in languages: for l in languages:
total += int(languages[l]) total += int(languages[l])
for l in languages: for l in languages:
tmp = int(languages[l]) / total * 100 tmp = round(int(languages[l]) / total * 100, 1)
if tmp > max: if tmp > max:
max = tmp max = tmp
principal = l principal = l
...@@ -167,15 +167,20 @@ def correctedSequence(text, last): ...@@ -167,15 +167,20 @@ def correctedSequence(text, last):
def getInfo(): def getInfo():
return st.session_state.author, st.session_state.title, st.session_state.watermark title = st.session_state.title
if title == "":
title = st.session_state.fileName.replace(".pdf", "")
return st.session_state.author, title, st.session_state.watermark
def txtToTSV(fileName, fileAddress, pdfDir): def txtToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1 st.session_state.page = 1
author, title, watermark = getInfo() author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n" tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
tsv, languages = segmentAbstract(fileName, fileAddress, tsv, author, title, with st.spinner(st.session_state.general_text_dict['loading']):
str(date.today().year), "1", "1", watermark) tsv, languages = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
if '/' in fileName: if '/' in fileName:
fileName = fileName.split('/')[1] fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", ".tsv"), "w", encoding="utf-8-sig") as file: with open(pdfDir + "/" + fileName.replace(".pdf", ".tsv"), "w", encoding="utf-8-sig") as file:
...@@ -224,6 +229,7 @@ def upPage(): ...@@ -224,6 +229,7 @@ def upPage():
def uploadZip(): def uploadZip():
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
return st.file_uploader( return st.file_uploader(
st.session_state.general_text_dict['file'], type=["zip"], key='file') st.session_state.general_text_dict['file'], type=["zip"], key='file')
...@@ -231,6 +237,7 @@ def uploadZip(): ...@@ -231,6 +237,7 @@ def uploadZip():
def askPDF(fileName): def askPDF(fileName):
with st.form("Submit"): with st.form("Submit"):
st.write(fileName) st.write(fileName)
st.write(st.session_state.general_text_dict['text3'])
col1, col2 = st.columns(2) col1, col2 = st.columns(2)
st.session_state.author = "" st.session_state.author = ""
st.session_state.title = "" st.session_state.title = ""
...@@ -279,6 +286,7 @@ if st.session_state.page == 2: ...@@ -279,6 +286,7 @@ if st.session_state.page == 2:
if st.session_state.page == 1: if st.session_state.page == 1:
fileName = os.listdir(st.session_state.zipDir.name)[st.session_state.nbDoc] fileName = os.listdir(st.session_state.zipDir.name)[st.session_state.nbDoc]
st.session_state.fileName = fileName
if '/' in fileName: if '/' in fileName:
fileName = fileName.split('/')[1] fileName = fileName.split('/')[1]
askPDF(fileName) askPDF(fileName)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment