Description

This R project contains scripts that convert the database of clinical trials on COVID-19 into Gargantext-readable files (see http://gargantext.org).

The database should be in TSV format (tab-separated fields, no quoting characters) and encoded in UTF-8.

Loading the data

First, define the name of the file to be processed (without its extension; the loading code appends ".csv"). This file should be located in the data/ folder.

library(lubridate)
source("coronalib.R") # R libraries 
# Base name (without extension) of the database file stored in data/.
name <- "Database 210120_cleaned"

# The file carries a .csv extension but its fields are tab-separated.
AllData <- read.csv(
  file.path("data", paste0(name, ".csv")),
  header = TRUE,
  sep = "\t"
)

# Drop the two eligibility-criteria columns; they are not used afterwards
# in this notebook.
AllData$Inclusion.criteria <- NULL
AllData$Exclusion.criteria <- NULL

# Disabled step: keep only the rows that have a registration date.
# AllData <- filter(AllData, !is.na(AllData$Registration.date))

# Number of rows loaded.
nrow(AllData)
[1] 8836
# Distinct trial registration numbers present in the database.
# NOTE(review): `x` is not read again in the visible chunks of this notebook;
# confirm it is still needed before removing it.
x <- unique(AllData$Trial.registration.number)
#head(x)[[1]]
# Preview the first rows of the loaded table as a quick sanity check.
head(AllData)
NA

Data segmentation

Several data frames are generated, one for each category of clinical trial (CT) under study.

library(dplyr) 
library(stringr) 
# Split the trials by study aim. The match is a case-sensitive regular
# expression applied to the Study.aim column.
Prevention <- filter(AllData, grepl("Prevention", Study.aim)) # CTs tagged Prevention
Treatments <- filter(AllData, grepl("Treatment", Study.aim)) # CTs tagged Treatment
Posttreatment <- filter(AllData, grepl("Post treatment", Study.aim)) # CTs tagged Post-treatment

# Report how many arms ended up in each subset.
print(
  paste(
    nrow(Prevention), " Prevention arms,",
    nrow(Treatments), " Treatments arms and ",
    nrow(Posttreatment), " Post-treatment arms."
  )
)
[1] "1244  Prevention arms, 4551  Treatments arms and  0  Post-treatment arms."

Export of data and viz

The data are exported in several formats. The list of all treatments is also exported, assuming that treatments are separated by a ‘+’ sign in the treatment column of the original database.

source("coronalib.R")
library(reshape)
library(wordcloud)
library(ggplot2)

## Exporting for phylo
garg_export_for_phylo(AllData, "AllData")
garg_export_for_phylo_types_only(AllData, "AllData")

# HTML format
# Export of a corpus with treatments and outcomes
garg_export_with_html(Treatments, "Treatment") # exports treatments and outcomes of the clinical trials of type Treatment
garg_export_with_html(AllData, "AllData") # exports treatments and outcomes of the clinical trials of all types
garg_export_treatments_with_html(AllData, "AllData") # exports treatments of the clinical trials of all types


# Export of the list of all types of treatments, whatever the phase, in the
# Gargantext map list format (Gargantext V3 & V4)
gargV4_export_treaments_list(AllData, "AllDb")
gargV3_export_treaments_list(AllData, "AllDb")
gargV3_export_treamentsTypes_list(AllData, "AllDb")


# Conversion of the tsv file into a Gargantext-readable tsv file
# Selection of the kind of CT to export: All / Prevention / Treatment / Post-treatment
# Selection of the kind of information to include in the main text to be
# processed by Gargantext (abstract column): treatments and/or outcomes


# Simple txt export
garg_export_all_plain(Treatments, "Treatment") # export main information in plain text
garg_export_OnlyTreatments(Treatments, "Treatment") # export only treatments in plain text
garg_export_OnlyOutcomes(Treatments, "Treatment") # export only outcomes in plain text
garg_export_all_plain(AllData, "All") # export main information in plain text

# Raw export (just to have specific maps)
garg_export_raw_treatments(AllData, "AllData") # export only info relative to treatments, without any formatting

# Some simple viz - tag cloud of the treatments per category of CT.
# TreatmentsCloud() fails on an empty subset (error raised from strwidth():
# invalid 'cex' value), which used to abort the chunk when no arm matched a
# category. Empty subsets are therefore skipped with a message.
draw_treatments_cloud <- function(arms, label) {
  if (nrow(arms) == 0) {
    message("No ", label, " arms found: tag cloud skipped.")
    return(invisible(NULL))
  }
  TreatmentsCloud(arms)
}

draw_treatments_cloud(Treatments, "Treatment")

draw_treatments_cloud(Prevention, "Prevention")

draw_treatments_cloud(Posttreatment, "Post-treatment")
Factor `treatments` contains implicit NA, consider using `forcats::fct_explicit_na`Error in strwidth(words[i], cex = size[i], ...) : valeur 'cex' incorrecte

LS0tCnRpdGxlOiAiQ29yb25hdmlydXMgQ2xpbmljYWwgVHJpYWxzIFIgbGlicmFyeSIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQojIyMgRGVzY3JpcHRpb24KClRoaXMgUiBwcm9qZWN0IGNvbnRhaW50IHNvbWUgc2NyaXB0IHRvIGZvcm1hdCB0aGUgZGF0YWJhc2Ugb2YgY2xpbmljYWwgdHJpYWxzIG9uIGNvdmlkLTE5IGluIEdhcmdhbnRleHQgcmVhZGFibGUgZmlsZXMgKHNlZSBodHRwOi8vR2FyZ2FybnRleHQub3JnKS4KClRoZSBkYXRhYmFzZSBzaG91bGQgYmUgaW4gdGhlIHRzdiBmb3JtYXQgKHNlcGFyYXRvciA9IHRhYnVsYXRpb24gOyBubyBkZWxpbWl0ZXJzKSBhbmQgYmUgZm9ybWF0ZWQgaW4gVVRGOC4KCiMjIyBMb2FkIG9mIERhdGEKRmlyc3QgZGVmaW5lIHdoYXQgaXMgdGhlIG5hbWUgb2YgdGhlIGZpbGUgdG8gYmUgcHJvY2Vzc2VkLiBUaGlzIGZpbGUgc2hvdWxkIGJlIGluIHRoZSBmb2xkZXIgL2RhdGEKYGBge3J9CmxpYnJhcnkobHVicmlkYXRlKQpzb3VyY2UoImNvcm9uYWxpYi5SIikgIyBSIGxpYnJhcmllcyAKbmFtZTwtIkRhdGFiYXNlIDIxMDEyMF9jbGVhbmVkIiAjIG5hbWUgb2YgdGhlIGNzdiB0byBiZSBsb2FkZWQKCkFsbERhdGEgPC1yZWFkLmNzdihwYXN0ZSgiZGF0YS8iLG5hbWUsIi5jc3YiLHNlcD0iIiksaGVhZD1UUlVFLHNlcD0iXHQiKQpBbGxEYXRhJEluY2x1c2lvbi5jcml0ZXJpYSA8LSBOVUxMCkFsbERhdGEkRXhjbHVzaW9uLmNyaXRlcmlhIDwtIE5VTEwKI0FsbERhdGEgPC0gZmlsdGVyKEFsbERhdGEsIWlzLm5hKEFsbERhdGEkUmVnaXN0cmF0aW9uLmRhdGUpKQpucm93KEFsbERhdGEpCnggPC0gdW5pcXVlKEFsbERhdGEkVHJpYWwucmVnaXN0cmF0aW9uLm51bWJlcikKI2hlYWQoeClbWzFdXQpoZWFkKEFsbERhdGEpCgpgYGAKIyMjIERhdGEgc2VnbWVudGF0aW9uClNldmVyYWwgZGF0YWZyYW1lIGFyZSBnZW5lcmF0ZWQgYWNjb3JkaW5nIHRvIHdoaWNoIGFyZSB0aGUgQ1RzIHVuZGVyIHN0dWR5LgoKYGBge3J9CmxpYnJhcnkoZHBseXIpIApsaWJyYXJ5KHN0cmluZ3IpIApQcmV2ZW50aW9uPC0gZmlsdGVyKEFsbERhdGEsZ3JlcGwoIlByZXZlbnRpb24iLEFsbERhdGEkU3R1ZHkuYWltKSkgIyBDVHMgdGFnZ8OpcyBwcmV2ZW50aW9uClRyZWF0bWVudHMgPC0gZmlsdGVyKEFsbERhdGEsZ3JlcGwoIlRyZWF0bWVudCIsQWxsRGF0YSRTdHVkeS5haW0pKSAjwqAjIENUcyB0YWdnw6lzIFRyZWFtZW50cwpQb3N0dHJlYXRtZW50PC0gZmlsdGVyKEFsbERhdGEsZ3JlcGwoIlBvc3QgdHJlYXRtZW50IixBbGxEYXRhJFN0dWR5LmFpbSkpICMgQ1RzIHRhZ2fDqSBQb3N0LVRyZWF0bWVudApwcmludChwYXN0ZShjb3VudChQcmV2ZW50aW9uKSwiIFByZXZlbnRpb24gYXJtcywiLCBjb3VudChUcmVhdG1lbnRzKSwiIFRyZWF0bWVudHMgYXJtcyBhbmQgIixjb3VudChQb3N0dHJlYXRtZW50KSwiIFBvc3QtdHJlYXRtZW50IGFybXMuIikpCmBgYAojIyBFeHBvcnQgb2YgZGF0YSBhbmQgdml6CkRhdGEgYXJl
IGV4cG9ydGVkIGluIHNldmVyYWwgZm9ybWF0cy4gVGhlIGxpc3Qgb2YgYWxsIHRyZWF0bWVudHMgaXMgYWxzbyBleHBvcnRlZCBhc3N1bWluZyB0aGF0IHRyZWF0bWVudHMgYXJlIHNlcGFyYXRlZCBieSBhICcrJyBzaWduZSBpbiB0aGUgY29sdW1uIHRyZWF0bWVudCBvZiB0aGUgb3JpZ2luYWwgZGIuCgpgYGB7cn0Kc291cmNlKCJjb3JvbmFsaWIuUiIpCmxpYnJhcnkocmVzaGFwZSkKbGlicmFyeSh3b3JkY2xvdWQpCmxpYnJhcnkoZ2dwbG90MikKCiMjIEV4cG9ydGluZyBmb3IgcGh5bG8KZ2FyZ19leHBvcnRfZm9yX3BoeWxvKEFsbERhdGEsIkFsbERhdGEiKQpnYXJnX2V4cG9ydF9mb3JfcGh5bG9fdHlwZXNfb25seShBbGxEYXRhLCJBbGxEYXRhIikKCiMgSHRtbCBmb3JtYXQKIyBleHBvcnQgb2YgYSBjb3JwdXMgd2l0aCB0cmVhdG1lbnRzIGFuZCBvdXRjb21lcwpnYXJnX2V4cG9ydF93aXRoX2h0bWwoVHJlYXRtZW50cywiVHJlYXRtZW50IikgIyBleHBvcnRlIFRyZWF0bWVhbnQgZXQgT3V0Y29tZXMgZGVzIGVzc2FpcyBjbGluaXF1ZXMgZGUgdHlwZSBUcmVhdG1lbnQKZ2FyZ19leHBvcnRfd2l0aF9odG1sKEFsbERhdGEsIkFsbERhdGEiKSAjIGV4cG9ydGUgVHJlYXRtZWFudCBldCBPdXRjb21lcyBkZXMgZXNzYWlzIGNsaW5pcXVlcyBkZSB0b3VzIHR5cGVzCmdhcmdfZXhwb3J0X3RyZWF0bWVudHNfd2l0aF9odG1sKEFsbERhdGEsIkFsbERhdGEiKSAjIGV4cG9ydGUgVHJlYXRtZWFudCBkZXMgZXNzYWlzIGNsaW5pcXVlcyBkZSB0b3VzIHR5cGVzCgoKIyBleHBvcnQgb2YgdGhlIGxpc3Qgb2YgYWxsIHR5cGVzIG9mIHRyZWF0bWVudHMgd2hhdGV2ZXIgdGhlIHBoYXNlIGluIHRoZSBmb3JtYXQgR2FyZ2FudGV4dCBtYXAgbGlzdCBHYXJnYW50ZXh0IFYzICYgVjQKZ2FyZ1Y0X2V4cG9ydF90cmVhbWVudHNfbGlzdChBbGxEYXRhLCJBbGxEYiIpCmdhcmdWM19leHBvcnRfdHJlYW1lbnRzX2xpc3QoQWxsRGF0YSwiQWxsRGIiKQpnYXJnVjNfZXhwb3J0X3RyZWFtZW50c1R5cGVzX2xpc3QoQWxsRGF0YSwiQWxsRGIiKQoKCiMgQ29udmVyc2lvbiBvZiB0aGUgdHN2IGZpbGUgaW50byBHYXJnYW50ZXh0IHJlYWRhYmxlIHRzdiBkaWxlCiMgU2VsZWNpdG9uIG9mIHRoZSBraW5kIG9mIENUIHRvIGV4cG9ydCA6IEFsbCAvIFByZXZlbnRpb24gLyBUcmVhdG1lbnQgLyBQb3N0LXRyZWF0bWVudAojwqBTZWxlY3Rpb24gb2YgdGhlIGtpbmQgb2YgaW5mb3JtYXRpb25zIHRvIGluY2x1ZGUgaW4gdGhlIG1haW4gdGV4dCB0byBiZSBwcm9jZXNzZWQgYnkgR2FyZ2FudGV4dCAoYmFzdHJhY3QgY29sdW1uKTogVHJlYXRtZWFudCBhbmQvb3IgT3V0Y29tZXMKCgojwqBzaW1wbGUgdHh0IGV4cG9ydApnYXJnX2V4cG9ydF9hbGxfcGxhaW4oVHJlYXRtZW50cywiVHJlYXRtZW50IikgIyBleHBvcnQgbWFpbiBpbmZvcm1hdGlvbiBpbiBwbGFpbiB0ZXh0CmdhcmdfZXhwb3J0X09ubHlUcmVhdG1lbnRzKFRyZWF0bWVudHMsIlRyZWF0bWVudCIpICMgZXhw
b3J0IG9ubHkgdHJlYXRtZW50cyBpbiBwbGFpbiB0ZXh0CmdhcmdfZXhwb3J0X09ubHlPdXRjb21lcyhUcmVhdG1lbnRzLCJUcmVhdG1lbnQiKSAjIGV4cG9ydCBvbmx5IG91dGNvbWVzIGluIHBsYWluIHRleHQKZ2FyZ19leHBvcnRfYWxsX3BsYWluKEFsbERhdGEsIkFsbCIpICMgZXhwb3J0IG1haW4gaW5mb3JtYXRpb24gaW4gcGxhaW4gdGV4dAoKIyByYXcgZXhwb3J0IChqdXN0IHRvIGhhdmUgc3BlY2lmaWMgbWFwcykKZ2FyZ19leHBvcnRfcmF3X3RyZWF0bWVudHMoQWxsRGF0YSwiQWxsRGF0YSIpICMjIGV4cG9ydCBvbmx5IGluZm8gcmVsYXRpdmUgdG8gdHJlYXRtZW50cyB3aXRob3V0IGFueSBmb3JtYXRpbmcuCgojIFNvbWUgc2ltcGxlIHZpeiAtIFRhZyBjbG91ZCBvZiB0aGUgdHJlYW1lbnRzIHBlciBjYXRlZ29yeSBvZiBDVApUcmVhdG1lbnRzQ2xvdWQoVHJlYXRtZW50cykKVHJlYXRtZW50c0Nsb3VkKFByZXZlbnRpb24pClRyZWF0bWVudHNDbG91ZChQb3N0dHJlYXRtZW50KQoKYGBgCmBgYHtyfQogIGRmIDwtIEFsbERhdGEKICB4IDwtIGRhdGEuZnJhbWUoInB1YmxpY2F0aW9uX2RheSI9ZGF5KGFzLkRhdGUoZGYkUmVnaXN0cmF0aW9uLmRhdGUsIiVZLSVtLSVkIikpKQogIHgkcHVibGljYXRpb25fbW9udGggPSBtb250aChhcy5EYXRlKGRmJFJlZ2lzdHJhdGlvbi5kYXRlLCIlWS0lbS0lZCIpKQogIHgkcHVibGljYXRpb25feWVhciA9IGZvcm1hdCgoYXMuRGF0ZShkZiRSZWdpc3RyYXRpb24uZGF0ZSwiJVktJW0tJWQiKSksIiVXIikKICB4JGF1dGhvcnM9ZGYkRmlyc3QuYXV0aG9yCiAgeCR0aXRsZT1kZiRUcmlhbC5yZWdpc3RyYXRpb24ubnVtYmVyCiAgeCRzb3VyY2U9ZGYkRnVuZGluZwogIHgkYWJzdHJhY3Q9cGFzdGUodG91cHBlcihkZiRQaGFybWFjb2xvZ2ljYWwudHJlYXRtZW50KSwiIC4gIixzdHJfcmVwbGFjZV9hbGwodG9sb3dlcihkZiRUcmVhdG1lbnQudHlwZSksIltbK11dIiwiIDsgIiksIiAuICIsc3RyX3JlcGxhY2VfYWxsKHRvbG93ZXIoZGYkVHJlYXRtZW50Lm5hbWUpLCJbWytdXSIsIiA7ICIpKQogIHkgPC0gZmlsdGVyKHgseCRwdWJsaWNhdGlvbl95ZWFyIT0iTkEiKQogIHdyaXRlLnRhYmxlKHksIGZpbGUgPSBwYXN0ZSgib3V0cHV0L0NUcGh5bG8iLGZpbGVuYW1lLCIuY3N2IixzZXAgPSAiIiksIHNlcCA9ICJcdCIscm93Lm5hbWVzID0gRkFMU0UpCgpgYGAKCgoK