pubmedToGarganText

cef0870c · Loïc Chapron · 485bc840 · cef0870c · cef0870c · cef0870c
Commit cef0870c authored Jul 25, 2023 by Loïc Chapron
4 changed files
--- a/Conversion/ToTSV/pubmedCorpusToTSV/README.md
+++ b/Conversion/ToTSV/pubmedCorpusToTSV/README.md
+# pubmedCorpusToTSV
+## About The project
+pubmedCorpusToTSV transforms a text file exported from PubMed into a TSV file usable in GarganText.
+## Usage
+```shell
+python3 pubmedCorpusToTsv.py corpus.txt
+```
+corpus.txt -> Text file from PubMed
+Outputs a legacy TSV file next to the text file: corpus.csv
+## Date
+This script was last updated on 2023/07/25.
+It may become outdated in the future.
+## Note
+Any nbib file also works with this script.
\ No newline at end of file
--- a/Conversion/ToTSV/pubmedCorpusToTSV/pubmedCorpusToTsv.py
+++ b/Conversion/ToTSV/pubmedCorpusToTSV/pubmedCorpusToTsv.py
+#######
+# pubmedCorpusToTsv.py
+# description : turn a pubmed file (nbib) into a gargantext tsv corpus (written with a .csv extension)
+# licence : AGPL + CECILL v3
+# author : quentin lobbé - qlobbe@iscpif.fr
+#######
+import sys
+import csv
+import nbib
+import re
+import calendar
+# usage: python3 pubmedCorpusToTsv.py corpus.txt
# Path of the PubMed export (nbib / text file) to convert, taken from the
# first command-line argument.
path = ""
try:
    path = sys.argv[1]
except IndexError:
    # No input file was given: report it on stderr and exit with a non-zero
    # status so shells and calling scripts can detect the failure.
    print("! args error\n", file=sys.stderr)
    sys.exit(1)
def normalizePath(path):
    """Split a '/'-separated file path into its directory and base name.

    Args:
        path: Path to a file, e.g. ``"data/corpus.txt"``.

    Returns:
        A ``(root, name)`` tuple. ``root`` is everything before the last
        ``'/'`` (an empty string when the path has no directory part; it
        never carries a trailing slash). ``name`` is the file name with
        only its last extension removed.
    """
    root, _, filename = path.rpartition('/')
    # Strip only the final extension so that dotted names such as
    # "pubmed.2023.txt" keep their full stem ("pubmed.2023").
    stem, dot, _ = filename.rpartition('.')
    # A name without any dot, or a dot-file like ".hidden", has no
    # extension to strip and is returned unchanged.
    name = stem if dot and stem else filename
    return (root, name)
# Month abbreviation ("Jan", "Feb", ...) -> month number. Index 0 of
# calendar.month_abbr is an empty placeholder and is skipped, so an empty
# token can never resolve to month 0.
_MONTH_NUMBERS = {
    abbr: num for num, abbr in enumerate(calendar.month_abbr) if abbr
}

# Characters that would corrupt a tab-separated row: double quotes and tabs
# are dropped, line breaks are turned into spaces.
_TSV_UNSAFE = str.maketrans({'"': None, '\t': None, '\n': ' ', '\r': ' '})

# Fields a record must provide to be exported.
_REQUIRED_FIELDS = ('title', 'publication_date', 'authors')


def _clean_field(text):
    """Return *text* as a string that fits in a single TSV cell."""
    return str(text).translate(_TSV_UNSAFE)


def _parse_publication_date(raw):
    """Split a date expected to look like '2023 Jul 25' into its parts.

    Returns a ``(year, month, day)`` tuple of strings. A missing or
    unrecognised month or day defaults to '1'. Only the first three letters
    of the month token are looked up, so a range such as 'Mar-Apr' maps to
    its first month.
    """
    parts = raw.split()
    year = parts[0] if parts else ''
    month = _MONTH_NUMBERS.get(parts[1][:3], 1) if len(parts) > 1 else 1
    day = parts[2] if len(parts) > 2 and parts[2].isdigit() else '1'
    return year, str(month), day


# The output file is written next to the input file, with a .csv extension.
root, name = normalizePath(path)
if root != '':
    root += '/'

header = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"

# The context manager guarantees the output is flushed and closed even if a
# record fails to convert; UTF-8 keeps non-ASCII titles/abstracts portable.
with open(root + name + ".csv", "w", encoding="utf-8") as output:
    output.write(header)
    for doc in nbib.read_file(path):
        # Records lacking any mandatory field are skipped.
        if not all(field in doc for field in _REQUIRED_FIELDS):
            continue

        # The source column carries the journal name when available (the
        # value was previously computed but a constant "scopus" was written).
        source = _clean_field(doc.get('journal', ''))
        abstract = _clean_field(doc.get('abstract', ''))
        title = _clean_field(doc['title'])
        year, month, day = _parse_publication_date(doc['publication_date'])

        # Commas inside a name are removed because the comma is the
        # separator between authors in the output column.
        authors = ','.join(
            _clean_field(author['author']).replace(',', '')
            for author in doc['authors']
        )

        # The last column is the document weight, always 1.
        row = "\t".join(
            [title, source, year, month, day, abstract, authors, str(1)]
        ) + "\n"
        output.write(row)
--- a/Conversion/ToTSV/pubmedCorpusToTSV/sample/pubmed-Biologie-set+abstract.txt
+++ b/Conversion/ToTSV/pubmedCorpusToTSV/sample/pubmed-Biologie-set+abstract.txt
--- a/Conversion/ToTSV/pubmedCorpusToTSV/sample/pubmed-Biologie-set.txt
+++ b/Conversion/ToTSV/pubmedCorpusToTSV/sample/pubmed-Biologie-set.txt