Skip to contents

Introduction

This vignette compares annotating CTRP-provided treatment ids to PubChem CIDs and CTD information.

Although the PubChem CID is a unique identifier for a compound, the PubChem API does not easily map treatment names to CIDs, at least not in a way that is reliable for commonly misnamed treatments. Specifically, for the CTRP treatment names (n=545), the PubChem API does not correctly map all of them to PubChem CIDs.

This vignette investigates which of these methods maps more compounds.

library(AnnotationGx)

# Load the CTRP treatment metadata table shipped with the package.
data(CTRP_treatmentMetadata)

# Take the FIRST row of CTRP_treatmentMetadata as a worked example.
# The selection is deterministic (row 1), not random, so the output shown
# below is reproducible.

treatment <- CTRP_treatmentMetadata[1, CTRP.treatmentid]
sprintf("CTRP treatment id : %s", treatment)
#> [1] "CTRP treatment id : CIL55"

# Map this single treatment name to a CID using PubChem.
mapCompound2CID(treatment)
#>      name    cids
#>    <char>   <int>
#> 1:  CIL55 6623618

Annotating using PubChem

# Query PubChem for the first ten CTRP treatment ids. The lookup is evaluated
# inside the data.table `j` expression so that `CTRP.treatmentid` resolves to
# the column. The outer parentheses print the result as well as assigning it.
# NOTE(review): `first = TRUE` presumably keeps one CID per name; confirm
# against the mapCompound2CID() documentation.
(
  compounds_to_cids <- CTRP_treatmentMetadata[
    1:10,
    AnnotationGx::mapCompound2CID(names = CTRP.treatmentid, first = TRUE)
  ]
)
# Read the names stored on the "failed" attribute of the lookup result
# (presumably the treatment ids PubChem could not resolve; verify against
# mapCompound2CID()).
failed <- names(attributes(compounds_to_cids)$failed)

# `failed` is now re-bound: from a character vector of ids to the
# de-duplicated metadata rows whose treatment id is in that vector.
failed <- unique(
  CTRP_treatmentMetadata[CTRP.treatmentid %in% failed, ]
)

# Add (by reference) a cleaned version of each id to retry the lookup with.
failed[
  ,
  CTRP.treatmentid_CLEANED := cleanCharacterStrings(CTRP.treatmentid)
]

# Second attempt: repeat the PubChem lookup for the previously failed rows,
# this time using the cleaned treatment ids. The result is printed and kept.
(
  failed_to_cids <- failed[
    ,
    AnnotationGx::mapCompound2CID(
      names = CTRP.treatmentid_CLEANED,
      first = TRUE
    )
  ]
)

# Names stored on the "failed" attribute of the second lookup (presumably
# the ids that still could not be mapped after cleaning; verify).
failed_again <- names(attributes(failed_to_cids)$failed)
# Join the CIDs recovered on the second attempt back onto their metadata rows.
# Rows with an NA `cids` are dropped before the merge, and `all.x = FALSE`
# keeps only names present in both tables.
failed_dt <- merge(
  failed_to_cids[!is.na(cids), ],
  failed,
  by.x = "name",
  by.y = "CTRP.treatmentid_CLEANED",
  all.x = FALSE
)
# `name` holds the cleaned id used as the join key; the original
# `CTRP.treatmentid` column is retained, so drop the key column by reference.
failed_dt[, name := NULL]

# Join the CIDs found on the first attempt onto the full metadata table,
# again keeping only treatments with a non-NA CID.
successful_dt <- merge(
  CTRP_treatmentMetadata,
  compounds_to_cids[!is.na(cids), ],
  by.x = "CTRP.treatmentid",
  by.y = "name",
  all.x = FALSE
)

# Stack both sets of mapped treatments. Columns are matched by name, and
# columns missing from one table (e.g. `CTRP.treatmentid_CLEANED`) are
# filled with NA.
mapped_PubChem <- data.table::rbindlist(
  list(successful_dt, failed_dt),
  use.names = TRUE,
  fill = TRUE
)