R scripts added in /Rscripts

f16f9681 · david Chavalarias · 5c8b59fa · f16f9681 · f16f9681 · f16f9681
Commit f16f9681 authored May 11, 2020 by david Chavalarias
9 changed files
--- a/Rscripts/Process_CT_for_Gargantext.Rmd
+++ b/Rscripts/Process_CT_for_Gargantext.Rmd
+---
+title: "Coronavirus Clinical Trials R library"
+output: html_notebook
+---
+### Description
+
+This R project contains some scripts to format the database of clinical trials on covid-19 into Gargantext-readable files (see http://gargantext.org).
+
+The database should be in the tsv format (separator = tabulation; no delimiters) and be encoded in UTF-8.
+
+### Load of Data
+First define the name of the file to be processed. This file should be in the folder /data.
+```{r}
+# Load the date helpers and the project's export/visualization functions.
+library(lubridate)
+source("coronalib.R") # project helper functions (exports, tag cloud)
+name<-"Database060520" # base name (without extension) of the file to load from data/
+
+# The file carries a .csv extension but is read as tab-separated, with a header row.
+AllData <-read.csv(paste("data/",name,".csv",sep=""),head=TRUE,sep="\t")
+# Drop the inclusion / exclusion criteria columns from the working data frame.
+AllData$Inclusion.criteria <- NULL
+AllData$Exclusion.criteria <- NULL
+#AllData <- filter(AllDataTemp,AllDataTemp$First.author!="N/A")
+# Quick sanity checks: number of rows, then the first distinct trial registration number.
+nrow(AllData)
+x <- unique(AllData$Trial.registration.number)
+head(x)[[1]]
+```
+### Data segmentation
+Several data frames are generated, one for each category of CTs under study.
+
+```{r}
+# Split the trials by study aim; each subset keeps the rows whose Study.aim
+# contains the corresponding label (case-sensitive pattern match via grepl).
+library(dplyr) 
+library(stringr) 
+Prevention<- filter(AllData,grepl("Prevention",AllData$Study.aim)) # CTs tagged "Prevention"
+Treatments <- filter(AllData,grepl("Treatment",AllData$Study.aim)) # CTs tagged "Treatment"
+Posttreatment<- filter(AllData,grepl("Post treatment",AllData$Study.aim)) # CTs tagged "Post treatment"
+# Report the size of each subset.
+print(paste(count(Prevention)," Prevention CTs,", count(Treatments)," Treatments CTs and ",count(Posttreatment)," Post-treatment CTs."))
+```
+## Export of data and viz
+Data are exported in several formats. The list of all treatments is also exported, assuming that treatments are separated by a '+' sign in the treatment column of the original database.
+
+```{r}
+# Reload the helper functions and attach the packages used by this chunk.
+source("coronalib.R")
+library(reshape)
+library(wordcloud)
+library(ggplot2)
+
+# HTML format
+# Export of a corpus with treatments and outcomes
+garg_export_with_html(Treatments,"Treatment") # exports treatments and outcomes of the clinical trials of type Treatment
+garg_export_with_html(AllData,"AllData") # exports treatments and outcomes of the clinical trials of all types
+garg_export_treatments_with_html(AllData,"AllData") # exports only the treatments of the clinical trials of all types
+
+
+# Export of the list of all treatments, whatever the phase, as Gargantext map lists (Gargantext V3 & V4)
+gargV4_export_treaments_list(AllData,"AllDb")
+gargV3_export_treaments_list(AllData,"AllDb")
+
+
+# Conversion of the tsv file into a Gargantext-readable tsv file
+# Selection of the kind of CT to export: All / Prevention / Treatment / Post-treatment
+# Selection of the kind of information to include in the main text processed by Gargantext (abstract column): treatments and/or outcomes
+
+
+# Simple plain-text exports
+garg_export_all_plain(Treatments,"Treatment") # export main information in plain text
+garg_export_OnlyTreatments(Treatments,"Treatment") # export only treatments in plain text
+garg_export_OnlyOutcomes(Treatments,"Treatment") # export only outcomes in plain text
+garg_export_all_plain(AllData,"All") # export main information in plain text
+
+# Raw export (just to have specific maps)
+garg_export_raw_treatments(AllData,"AllData") ## export only the treatment information, without any HTML formatting
+
+# Some simple viz - tag cloud of the treatments per category of CT
+TreatmentsCloud(Treatments)
+TreatmentsCloud(Prevention)
+TreatmentsCloud(Posttreatment)
+
+```
+
+
--- a/Rscripts/Process_CT_for_Gargantext.nb.html
+++ b/Rscripts/Process_CT_for_Gargantext.nb.html
--- a/Rscripts/README.md
+++ b/Rscripts/README.md
+# process_clinical_trials
+
+Some R scripts to convert a clinical trials tsv file into Gargantext-readable files, plus other small scripts.
+
+The main file to run is Process_CT_for_Gargantext.Rmd 
+
+Make sure that you have placed your database in the /Rscripts/data folder, in the tsv format (separator = tabulation; no delimiters), encoded in UTF-8.
+
+When Process_CT_for_Gargantext.Rmd is run, a new csv file is written at the right place in /data to update the clinical trials descriptions.
--- a/Rscripts/coronalib.R
+++ b/Rscripts/coronalib.R
+# Export Functions
+
+## Gargantext V3
+
+### abstract as html file as in template merging treatments and outcome
+garg_export_with_html <- function(df,filename) {
+  # Build a table with the columns publication_day, publication_month,
+  # publication_year, authors, title, source and abstract, where the abstract
+  # is an HTML snippet merging study design, patients, treatment and primary
+  # outcome, then write it tab-separated to
+  # ../data/gargCSV_all_html_<filename>.csv.
+  #
+  # df:       data frame of clinical trials (one row per exported record).
+  # filename: suffix inserted into the output file name.
+
+  # Parse the registration date once and reuse it for day / month / year.
+  reg_date <- as.Date(df$Registration.date,"%Y-%m-%d")
+  x <- data.frame("publication_day"=day(reg_date))
+  x$publication_month <- month(reg_date)
+  x$publication_year <- year(reg_date)
+  x$authors <- df$First.author
+  x$title <- df$Trial.registration.number
+  x$source <- df$Funding
+
+  # Lower-cased treatment names with every "+" surrounded by spaces.
+  treatment_name <- str_replace_all(tolower(df$Treatment.name),"[[+]]"," + ")
+
+  # paste() joins the pieces with its default single-space separator.
+  x$abstract <- paste(
+    "<a href='",
+    df$Full.text.link,
+    "' target='blank'>Link to study</a></br></br><b> ",
+    df$Study.design,
+    " (</b>",
+    df$Recruitment.status,
+    ")</br></br><b>TYPE OF PATIENTS:</b> ",
+    df$Type.of.patients,
+    "</br></br><b>",
+    toupper(df$Pharmacological.treatment),
+    "</b></br><i>Type: </i>",
+    df$Treatment.type,
+    "</br><i> Treatment name: </i>",
+    treatment_name,
+    "</br></br><b>PRIMARY OUTCOME</b></br>",
+    df$primary.outcome,
+    "</br></br><b>",
+    df$Center,
+    "</b> (",
+    df$Countries,
+    ")</br><b>N in this treatment group:</b> ",
+    df$n.randomized.in.this.arm,
+    " </br><b>N in the trial: </b>",
+    df$Total.sample.size
+  )
+
+  out_path <- paste0("../data/gargCSV_all_html_",filename,".csv")
+  write.table(x, file = out_path, sep = "\t",row.names = FALSE)
+}
+
+garg_export_treatments_with_html <- function(df,filename) {
+  # Same table layout as the other exports (date parts, authors, title,
+  # source, abstract), but the HTML abstract only carries the study link and
+  # the treatment information. Written tab-separated to
+  # output/gargCSV_Treatments_html_<filename>.csv.
+  #
+  # df:       data frame of clinical trials.
+  # filename: suffix inserted into the output file name.
+
+  # Parse the registration date once and reuse it for day / month / year.
+  reg_date <- as.Date(df$Registration.date,"%Y-%m-%d")
+  x <- data.frame("publication_day"=day(reg_date))
+  x$publication_month <- month(reg_date)
+  x$publication_year <- year(reg_date)
+  x$authors <- df$First.author
+  x$title <- df$Trial.registration.number
+  x$source <- df$Funding
+
+  # Lower-cased treatment names with every "+" surrounded by spaces.
+  treatment_name <- str_replace_all(tolower(df$Treatment.name),"[[+]]"," + ")
+
+  # paste() joins the pieces with its default single-space separator.
+  x$abstract <- paste(
+    "<a href='",
+    df$Full.text.link,
+    "' target='blank'>Link to study</a></br></br><b> ",
+    toupper(df$Pharmacological.treatment),
+    "</b></br><i>Type: </i>",
+    df$Treatment.type,
+    "</br><i> Treatment name: </i>",
+    treatment_name
+  )
+
+  out_path <- paste0("output/gargCSV_Treatments_html_",filename,".csv")
+  write.table(x, file = out_path, sep = "\t",row.names = FALSE)
+}
+
+garg_export_raw_treatments<- function(df,filename) {
+  # Export the treatment information only, without HTML markup: the abstract
+  # is "<PHARMACOLOGICAL TREATMENT> . <treatment type> . <treatment names>",
+  # with every "+" in the treatment names replaced by " ; ". Written
+  # tab-separated to output/gargCSV_Treatments_raw_<filename>.csv.
+  #
+  # df:       data frame of clinical trials.
+  # filename: suffix inserted into the output file name.
+
+  # Parse the registration date once and reuse it for day / month / year.
+  reg_date <- as.Date(df$Registration.date,"%Y-%m-%d")
+  x <- data.frame("publication_day"=day(reg_date))
+  x$publication_month <- month(reg_date)
+  x$publication_year <- year(reg_date)
+  x$authors <- df$First.author
+  x$title <- df$Trial.registration.number
+  x$source <- df$Funding
+
+  treatment_name <- str_replace_all(tolower(df$Treatment.name),"[[+]]"," ; ")
+
+  # paste() joins the pieces with its default single-space separator.
+  x$abstract <- paste(
+    toupper(df$Pharmacological.treatment),
+    " . ",
+    df$Treatment.type,
+    " . ",
+    treatment_name
+  )
+
+  out_path <- paste0("output/gargCSV_Treatments_raw_",filename,".csv")
+  write.table(x, file = out_path, sep = "\t",row.names = FALSE)
+}
+
+
+garg_export_all_plain <- function(df,filename) {
+  # Plain-text counterpart of the HTML export: the abstract lists the study
+  # link, design, recruitment status, patients, treatment, primary outcome,
+  # center, country and sample sizes as labelled fields. Written
+  # tab-separated to output/gargCSV_all_plain_<filename>.csv.
+  #
+  # df:       data frame of clinical trials.
+  # filename: suffix inserted into the output file name.
+
+  # Parse the registration date once and reuse it for day / month / year.
+  reg_date <- as.Date(df$Registration.date,"%Y-%m-%d")
+  x <- data.frame("publication_day"=day(reg_date))
+  x$publication_month <- month(reg_date)
+  x$publication_year <- year(reg_date)
+  x$authors <- df$First.author
+  x$title <- df$Trial.registration.number
+  x$source <- df$Funding
+
+  # Lower-cased treatment names with every "+" surrounded by spaces.
+  treatment_name <- str_replace_all(tolower(df$Treatment.name),"[[+]]"," + ")
+
+  # paste() joins the pieces with its default single-space separator.
+  x$abstract <- paste(
+    "Link to study: ",
+    df$Full.text.link,
+    " DESIGN: ",
+    df$Study.design,
+    ", RECRUITEMENT STATUS: ",
+    df$Recruitment.status,
+    ", TYPE OF PATIENTS: ",
+    df$Type.of.patients,
+    " TREATMENT: ",
+    toupper(df$Pharmacological.treatment),
+    ", TYPE: ",
+    df$Treatment.type,
+    ", TREATMENT NAME: ",
+    treatment_name,
+    ", PRIMARY OUTCOME: ",
+    df$primary.outcome,
+    ", CENTER: ",
+    df$Center,
+    ", COUNTRY: ",
+    df$Countries,
+    ", N in this treatment group: ",
+    df$n.randomized.in.this.arm,
+    ", N in the trial: ",
+    df$Total.sample.size
+  )
+
+  out_path <- paste0("output/gargCSV_all_plain_",filename,".csv")
+  write.table(x, file = out_path, sep = "\t",row.names = FALSE)
+}
+
+garg_export_OnlyTreatments <- function(df,filename) {
+  # Plain-text export whose abstract only holds the treatment fields
+  # (treatment, type, treatment name). Note that the authors column is filled
+  # with the full-text link here, not with the first author. Written
+  # tab-separated to output/gargCSV_only_treatment_plain<filename>.csv.
+  #
+  # df:       data frame of clinical trials.
+  # filename: suffix appended directly after "plain" in the output file name.
+
+  # Parse the registration date once and reuse it for day / month / year.
+  reg_date <- as.Date(df$Registration.date,"%Y-%m-%d")
+  x <- data.frame("publication_day"=day(reg_date))
+  x$publication_month <- month(reg_date)
+  x$publication_year <- year(reg_date)
+  x$authors <- df$Full.text.link
+  x$title <- df$Trial.registration.number
+  x$source <- df$Funding
+
+  # Lower-cased treatment names with every "+" surrounded by spaces.
+  treatment_name <- str_replace_all(tolower(df$Treatment.name),"[[+]]"," + ")
+
+  # paste() joins the pieces with its default single-space separator.
+  x$abstract <- paste(
+    " TREATMENT: ",
+    toupper(df$Pharmacological.treatment),
+    ", TYPE: ",
+    df$Treatment.type,
+    ", TREATMENT NAME: ",
+    treatment_name
+  )
+
+  out_path <- paste0("output/gargCSV_only_treatment_plain",filename,".csv")
+  write.table(x, file = out_path, sep = "\t",row.names = FALSE)
+}
+
+garg_export_OnlyOutcomes <- function(df,filename) {
+  # Plain-text export whose abstract only holds the primary outcome. The
+  # authors column is filled with the full-text link. Written tab-separated
+  # to output/gargCSV_only_outcomes_plain<filename>.csv.
+  #
+  # df:       data frame of clinical trials.
+  # filename: suffix appended directly after "plain" in the output file name.
+
+  # Parse the registration date once and reuse it for day / month / year.
+  reg_date <- as.Date(df$Registration.date,"%Y-%m-%d")
+  x <- data.frame("publication_day"=day(reg_date))
+  x$publication_month <- month(reg_date)
+  x$publication_year <- year(reg_date)
+  x$authors <- df$Full.text.link
+  x$title <- df$Trial.registration.number
+  x$source <- df$Funding
+  x$abstract <- paste("PRIMARY OUTCOME: ",df$primary.outcome)
+
+  out_path <- paste0("output/gargCSV_only_outcomes_plain",filename,".csv")
+  write.table(x, file = out_path, sep = "\t",row.names = FALSE)
+}
+
+# Export of term lists
+
+getTreatmentsList <- function(df){
+  ## Build the list of treatments, relying on the fact that they are separated
+  ## by "+" in the Treatment.name column.
+  ## Returns (invisibly) a one-column data frame named "treatments" holding
+  ## every lower-cased piece, duplicates included.
+  joined <- tolower(paste(df$Treatment.name,collapse = "+"))
+  # "+" is first mapped to "&" and the text is then split on "&".
+  pieces <- strsplit(str_replace_all(joined,"[[+]]","&"),"&")[[1]]
+  invisible(data.frame("treatments"=pieces))
+}
+
+gargV3_export_treaments_list <- function(df,filename) {
+  ## Export the list of all distinct treatments for import into Gargantext V3.
+  ##
+  ## df:       data frame whose Treatment.name column holds "+"-separated
+  ##           treatment names.
+  ## filename: not used; the output path is fixed. Kept so existing calls
+  ##           keep working.
+  ##
+  ## Writes a tab-separated, unquoted file with the columns "label" and
+  ## "forms" (both holding the treatment name) to
+  ## output/WL_GargV3_clinicaltrialsTreatments.csv.
+
+  # Read from the `df` argument: the previous version read the global
+  # `AllData`, so the function ignored the data frame it was given and failed
+  # when no such global existed.
+  y <- paste(df$Treatment.name,collapse = "+")
+  # "+" is first mapped to "&" and the text is then split on "&".
+  z <- data.frame("label"=unique(strsplit(str_replace_all(tolower(y),"[[+]]","&"),"&")[[1]]))
+  z$forms <- z$label
+  write.table(z, file = "output/WL_GargV3_clinicaltrialsTreatments.csv", sep = "\t",row.names = FALSE,quote = FALSE)
+}
+
+gargV4_export_treaments_list <- function(df,filename) {
+  ## Export the list of all treatments for import into Gargantext V4, based
+  ## on treatments separated by "+" in the original database.
+  ##
+  ## df:       data frame whose Treatment.name column holds "+"-separated
+  ##           treatment names.
+  ## filename: suffix inserted into the output file name.
+  ##
+  ## Writes a single JSON object to
+  ## output/WL_GargV4_CT_treaments_<filename>.json.
+
+  # One JSON key per distinct, non-empty treatment: repeated treatments would
+  # otherwise produce duplicate keys in the "data" object.
+  terms <- unique(as.character(getTreatmentsList(df)$treatments))
+  terms <- terms[nzchar(terms)]
+
+  # Escape backslashes first, then double quotes, so each key stays a valid
+  # JSON string.
+  terms <- gsub("\\", "\\\\", terms, fixed = TRUE)
+  terms <- gsub("\"", "\\\"", terms, fixed = TRUE)
+
+  # paste0 (no separator): paste() with its default separator wrapped every
+  # term in spaces inside the quotes.
+  entries <- if (length(terms) > 0) {
+    paste0('"',terms,'":{"size":1,"list":"GraphTerm","children":[]}')
+  } else {
+    character(0)
+  }
+  output <- paste0('{"NgramsTerms":{"version":1237,"data":{',paste(entries,collapse = ','),"}}}")
+
+  # writeLines emits only the JSON text; write.table on a character vector
+  # also wrote a header line ("x") before it, which made the file invalid
+  # JSON.
+  writeLines(output, con = paste0("output/WL_GargV4_CT_treaments_",filename,".json"))
+}
+
+ColumnTextCount <- function(df,column){
+  # Returns (invisibly) a one-column data frame named "treatments" with the
+  # lower-cased, "+"-separated pieces of df$Treatment.name.
+  # NOTE(review): `column` is never used; the body always reads
+  # Treatment.name and does no counting. Confirm the intended behavior
+  # before relying on this function.
+  all_names <- tolower(paste(df$Treatment.name,collapse = "+"))
+  # "+" is first mapped to "&" and the text is then split on "&".
+  parts <- strsplit(str_replace_all(all_names,"[[+]]","&"),"&")[[1]]
+  invisible(data.frame("treatments"=parts))
+}
+
+## Visualization functions
+TreatmentsCloud <- function(df){
+  # Draw a word cloud of the treatments found in df$Treatment.name, sized by
+  # how often each treatment occurs.
+  # Occurrence table: one row per distinct treatment, frequency in column n.
+  term_freq <- count(getTreatmentsList(df), treatments)
+  # Fixed seed so the layout is the same on every run.
+  set.seed(1234)
+  wordcloud(
+    words = term_freq$treatments,
+    freq = term_freq$n,
+    min.freq = 2,
+    max.words = 400,
+    random.order = FALSE,
+    rot.per = 0.35,
+    colors = brewer.pal(8, "Dark2")
+  )
+}
--- a/Rscripts/coronavirusDataProcessing.Rproj
+++ b/Rscripts/coronavirusDataProcessing.Rproj
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
--- a/Rscripts/data/Database060520.csv
+++ b/Rscripts/data/Database060520.csv
--- a/data/conditionnal/project_conf.json
+++ b/data/conditionnal/project_conf.json
@@ -4,7 +4,7 @@
      "name": "term",
      "reldbs": {
        "csv": {
-          "file": "../clinicaltrials.csv",
+          "file": "../gargCSV_all_html_AllData.csv",
          "qcols":  ["title","abstract"],
          "template": "bib_details"
        }

--- a/data/distributional/project_conf.json
+++ b/data/distributional/project_conf.json
@@ -4,7 +4,7 @@
      "name": "term",
      "reldbs": {
        "csv": {
-          "file": "../clinicaltrials.csv",
+          "file": "../gargCSV_all_html_AllData.csv",
          "qcols":  ["title","abstract"],
          "template": "bib_details"
        }

--- a/data/clinicaltrials.csv
+++ b/data/clinicaltrials.csv