Commit cef0870c authored by Loïc Chapron's avatar Loïc Chapron

pubmedToGarganText

parent 485bc840
# pubmedCorpusToTSV
## About The project
pubmedCorpusToTSV transforms a text file exported from PubMed into a TSV file usable in GarganText.
## Usage
```shell
python3 pubmedCorpusToTSV.py corpus.txt
```
corpus.txt -> Text file from PubMed
Outputs a legacy TSV file next to the text file: corpus.csv
## Date
This script was last updated on 2023/07/25.
It may become outdated in the future.
## Note
Any nbib file also works with this script.
\ No newline at end of file
#######
# pubmedCorpusToCsv.py
# description : turn a pubmed file (nbib) into a gargantext csv corpus
# licence : AGPL + CECILL v3
# author : quentin lobbé - qlobbe@iscpif.fr
#######
import sys
import csv
import nbib
import re
import calendar
# python3 pubmedCorpusToCsv.py corpus.txt
# The input file (PubMed text / nbib export) is the first command-line argument.
try:
    path = sys.argv[1]
except IndexError:
    # No input file was given: report it and stop with a non-zero status so
    # that calling shells and pipelines can detect the failure.
    print ("! args error\n")
    sys.exit(1)
def normalizePath(path):
    """Split a '/'-separated file path into its directory and base name.

    Only the final extension is removed from the file name, so
    "data/my.corpus.txt" yields ("data", "my.corpus") instead of being cut
    at the first dot.

    Args:
        path: path of the input file, using '/' as separator.

    Returns:
        A (root, name) tuple. root is everything before the last '/' (empty
        when the path has no '/' or only a leading one); name is the file
        name without its last extension.
    """
    root, _, filename = path.rpartition('/')
    stem, dot, _ = filename.rpartition('.')
    # No dot at all, or a leading-dot name such as ".hidden": keep the whole
    # file name rather than returning an empty name.
    name = stem if dot and stem else filename
    return (root, name)
# The output file is written next to the input file, reusing its base name
# with a ".csv" extension (the content itself is tab-separated).
root, name = normalizePath(path)
if root != '':
    root += '/'

# Column layout of the output file.
header = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"

# Month abbreviation -> month number (1-12). calendar.month_abbr[0] is the
# empty string; it is skipped so that an empty token can never resolve to
# the invalid month 0.
_MONTH_NUMBERS = {abbr: num for num, abbr in enumerate(calendar.month_abbr) if abbr}


def _clean_field(text):
    # Double quotes and tabs would corrupt the tab-separated row, so drop them.
    return re.sub('\"', "", text).replace("\t", "")


def _month_number(token):
    # Unknown tokens (anything that is not a month abbreviation) fall back
    # to January, like a missing month does.
    return _MONTH_NUMBERS.get(token, 1)


# "with" guarantees the output is flushed and closed, even if a record fails.
with open(root + name + ".csv", "w") as output:
    output.write(header)
    for doc in nbib.read_file(path):
        keys = doc.keys()
        # Records lacking a title, a publication date or authors are skipped.
        if not all(key in keys for key in ('title', 'publication_date', 'authors')):
            continue

        # Journal and abstract are optional; they default to an empty cell.
        source = _clean_field(doc['journal']) if 'journal' in keys else ""
        abstract = _clean_field(doc['abstract']) if 'abstract' in keys else ""
        title = _clean_field(doc['title'])

        # The publication date is split on spaces into year, month and day
        # tokens; a missing or unusable month or day defaults to 1.
        date = doc['publication_date'].split(' ')
        year = date[0]
        month = _month_number(date[1]) if len(date) > 1 else 1
        day = date[2] if len(date) > 2 and date[2] else '1'

        # Commas inside a name are removed because ',' separates the authors.
        authors = ','.join(
            author['author'].replace(',', '') for author in doc['authors']
        )

        # The source column carries the record's journal (previously the
        # journal was read but a constant "scopus" was written instead).
        fields = [title, source, year, str(month), str(day), abstract, authors, str(1)]
        output.write("\t".join(fields) + "\n")
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment