0 Posted 2020-06-14Updated 2024-01-11R / Bio / Anno/Enrich25 minutes read (About 3689 words)

clusterProfiler

Tutorial: yulab

Install

BiocManager::install("clusterProfiler")

Enrichment analysis and GSEA

Enrichment analysis is a computational method used in bioinformatics to determine whether a given set of genes or proteins is enriched for specific functions, pathways, or biological processes. It involves comparing the input gene set to a reference database, such as Gene Ontology, KEGG, or Reactome, and identifying the over-represented terms using statistical methods.

Gene Set Enrichment Analysis (GSEA) is a type of enrichment analysis that determines whether a particular gene set shows statistically significant differences between two biological states. Unlike traditional enrichment analysis, GSEA considers the entire gene set, rather than individual genes, and evaluates whether the gene set is significantly enriched at the top or bottom of a ranked list of genes based on their differential expression or correlation with a phenotype. GSEA is commonly used in transcriptomics and genomics studies to identify pathways or biological processes that are differentially regulated between disease and control samples.
© ChatGPT

Go ontology

GO_1. Supported Organism

For module species which added in OrgDb, we can turn the ID to GO_id;

For other species, you can build your own OrgDb database by following GOSemSim.

If genes are already annotated (in data.frame witch gene ID column followed by GO ID), we can use enricher() and geosGO() function to perform over-representation test.

GO_2. Example for `OrgDb` species

library(clusterProfiler)
library(org.Hs.eg.db)

data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]
gene.df <- bitr(gene, fromType = "ENTREZID",
        toType = c("ENSEMBL", "SYMBOL"),
        OrgDb = org.Hs.eg.db)
head(gene.df)

ENTREZID         ENSEMBL SYMBOL
   4312 ENSG00000196611   MMP1
   8318 ENSG00000093009  CDC45
  10874 ENSG00000109255    NMU
  55143 ENSG00000134690  CDCA8
  55388 ENSG00000065328  MCM10
    991 ENSG00000117399  CDC20

	4312	8318
ACCNUM	AAA35699	NP_001139410
ENTREZID	ENSG00000196611	ENSG00000093009
ENSEMBL SYMBOL	MMP1	CDC45

ggo <- groupGO(gene     = gene,
               OrgDb    = org.Hs.eg.db,
               ont      = "CC",
               level    = 3,
               readable = TRUE)

head(ggo)
barplot(ggo, showCategory = 20)

ID                    Description Count GeneRatio
GO:0005886 GO:0005886                plasma membrane    55    55/207
GO:0005628 GO:0005628              prospore membrane     0     0/207
GO:0005789 GO:0005789 endoplasmic reticulum membrane     8     8/207
...

132

GO_3. Over-Representation Test

ego <- enrichGO(gene          = gene,
                universe      = names(geneList),
                OrgDb         = org.Hs.eg.db,
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
head(ego)

ego2 <- enrichGO(gene         = gene.df$ENSEMBL,
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENSEMBL',
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)

## turn ENSEMBL ID to Samble
ego2_r <- setReadable(ego2, OrgDb = org.Hs.eg.db)

dotplot(ego2)

ego	ego2

GO_4 GO Gene Set Enrichment Analysis

ego3 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "CC",
              nPerm        = 1000,
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)

dotplot(ego3)

ID                              Description setSize enrichmentScore       NES      pvalue   p.adjust    qvalues rank                   leading_edge
GO:0031012 GO:0031012                     extracellular matrix     427      -0.4868578 -2.138854 0.001231527 0.03095063 0.02171974 1797 tags=37%, list=14%, signal=33%
GO:0099568 GO:0099568                       cytoplasmic region     368      -0.3426037 -1.488559 0.001259446 0.03095063 0.02171974 2613 tags=28%, list=21%, signal=23%

GO ontology for Non-module Species

KEGG Enrichment

KE_1. Over-Representation Test

library(clusterProfiler)
search_kegg_organism('ece', by='kegg_code')

ecoli <- search_kegg_organism('Escherichia coli', by='scientific_name')
dim(ecoli)

data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]

kk <- enrichKEGG(gene         = gene,
                 organism     = 'hsa',
                 pvalueCutoff = 0.05)
head(kk)

ID                                                   Description GeneRatio  BgRatio       pvalue     p.adjust       qvalue                                             geneID Count
hsa04110 hsa04110                                                    Cell cycle     11/93 124/8031 1.590786e-07 3.213388e-05 3.148083e-05 8318/991/9133/890/983/4085/7272/1111/891/4174/9232    11
hsa04114 hsa04114                                                Oocyte meiosis     10/93 128/8031 1.960649e-06 1.980256e-04 1.940011e-04    991/9133/983/4085/51806/6790/891/9232/3708/5241    10
hsa04218 hsa04218                                           Cellular senescence     10/93 160/8031 1.450991e-05 9.375595e-04 9.185054e-04     2305/4605/9133/890/983/51806/1111/891/776/3708    10
hsa04061 hsa04061 Viral protein interaction with cytokine and cytokine receptor      8/93 100/8031 1.856553e-05 9.375595e-04 9.185054e-04           3627/10563/6373/4283/6362/6355/9547/1524     8
hsa03320 hsa03320                                        PPAR signaling pathway      7/93  77/8031 2.765640e-05 1.117319e-03 1.094611e-03                 4312/9415/9370/5105/2167/3158/5346     7
hsa04914 hsa04914                       Progesterone-mediated oocyte maturation      7/93  99/8031 1.392778e-04 4.689019e-03 4.593724e-03                    9133/890/983/4085/6790/891/5241     7

KE_2. Gene Set Enrichment Analysis

kk2 <- gseKEGG(geneList     = geneList,
               organism     = 'hsa',
               nPerm        = 1000,
               minGSSize    = 120,
               pvalueCutoff = 0.05,
               verbose      = FALSE)
head(kk2)

ID                 Description setSize enrichmentScore       NES      pvalue  p.adjust    qvalues rank                   leading_edge
hsa04510 hsa04510              Focal adhesion     190      -0.4169068 -1.697372 0.001418440 0.0241065 0.01552815 2183 tags=27%, list=17%, signal=22%
hsa03013 hsa03013               RNA transport     131       0.4116488  1.742019 0.003067485 0.0241065 0.01552815 3383 tags=40%, list=27%, signal=29%
hsa05152 hsa05152                Tuberculosis     162       0.3745153  1.628558 0.003164557 0.0241065 0.01552815 2823 tags=34%, list=23%, signal=27%
hsa04218 hsa04218         Cellular senescence     143       0.4153718  1.761689 0.003225806 0.0241065 0.01552815 1155  tags=17%, list=9%, signal=16%
hsa05203 hsa05203        Viral carcinogenesis     164       0.3523856  1.535177 0.003246753 0.0241065 0.01552815 3112 tags=35%, list=25%, signal=26%
hsa04062 hsa04062 Chemokine signaling pathway     165       0.3754101  1.632867 0.003300330 0.0241065 0.01552815 1298 tags=21%, list=10%, signal=19%

kk	kk2

KE_3. KEGG Module over-representation test

mkk <- enrichMKEGG(gene = gene,
                   organism = 'hsa')

KE_4. KEGG Module Gene Set Enrichment Analysis

mkk2 <- gseMKEGG(geneList = geneList,
                 organism = 'hsa')

A Pipeline Example

Result from: edgeR
species: dme

library(clusterProfiler)
library(org.Dm.eg.db)
library(pathview)
library(reshape2)
library(stringr)
library(ggplot2)
library(enrichplot)

args = commandArgs(trailingOnly=TRUE)

SampleA= args[1]#"G1F1"
SampleB= args[2]#"G50F"
Pvalue = 0.01
logFC= 2
C_cut = 10
Level= 2

Sample_Dir = paste(SampleA,"_VS_",SampleB,"/",sep="")
for(i in c("GO", "KEGG", "WIKI", "Reactome")){
    dir.create(i)
    dir.create(paste(i, Sample_Dir,sep='/'))
}

Gene_list <- read.table(paste("RSEM_transcript/transcript.isoform.counts.matrix.", SampleA, "_vs_", SampleB,".edgeR.DE_results", sep=""))


ENTREZID = bitr(row.names(Gene_list), fromType="FLYBASE", toType="ENTREZID", OrgDb="org.Dm.eg.db")
FLYBASECG = bitr(row.names(Gene_list), fromType="FLYBASE", toType="FLYBASECG", OrgDb="org.Dm.eg.db")
SYMBOL = bitr(row.names(Gene_list), fromType="FLYBASE", toType="SYMBOL", OrgDb="org.Dm.eg.db")


Gene_list$ENTREZID <- ENTREZID$ENTREZID[match(row.names(Gene_list), ENTREZID$FLYBASE)]
Gene_list$FLYBASECG <- FLYBASECG$FLYBASECG[match(row.names(Gene_list), FLYBASECG$FLYBASE)]
Gene_list$SYMBOL <- SYMBOL$SYMBOL[match(row.names(Gene_list), SYMBOL$FLYBASE)]

KE2SY <-function(kk_GSEA, COL){
    if( nrow(kk_GSEA@result) >0){
        for(i in c(1:nrow(kk_GSEA@result))){
            LIST <- kk_GSEA@result[[COL]][i]
            kk_GSEA@result[[COL]][i] <- paste(Gene_list$SYMBOL[match(str_remove(str_split(LIST, "/")[[1]], "Dmel_"),Gene_list$FLYBASECG )], collapse = "/")
        }        
    }
    return(kk_GSEA)
}

EN2SY <-function(WikiP_enrich, COL){
    if(length(WikiP_enrich@result[[COL]])>0){
        for(i in c(1:length(WikiP_enrich@result[[COL]]))){
            LIST <- WikiP_enrich@result[[COL]][i]
            WikiP_enrich@result[[COL]][i] <- paste(Gene_list$SYMBOL[match(str_remove(str_split(LIST, "/")[[1]], "Dmel_"),Gene_list$ENTREZID )], collapse = "/")
        }
    }
    return(WikiP_enrich)
}

ggsave_GO <- function(NAME, LEN){
    BASE = 3.8
    RATE = 0.125
    if (BASE + RATE*LEN <= 40){
        W = BASE + RATE*LEN
    }else{
        W = 40
    }
    ggsave(NAME , w = W, h = 8.35, limitsize = FALSE )
}

ggsave_GO_enrich <- function(NAME, LEN){
    BASE = 2
    RATE = 0.1114
    if (BASE + RATE*LEN <= 40){
        H = BASE + RATE*LEN
    }else{
        H = 40
    }
    ggsave(NAME , w = 10, h = H, limitsize = FALSE)
}

ridgeplot_save <- function(NAME,LEN){
    BASE = 1.02
    RATE = 0.2
    if (BASE + RATE*LEN <= 40){
        H = BASE + RATE*LEN
    }else{
        H = 40
    }
    ggsave(NAME , w = 10, h = H, limitsize = FALSE)
}

## GO ontology
TMP <- Gene_list[abs(Gene_list$logFC) >=2,]
TMP <- TMP[TMP$PValue <= Pvalue,]


sig_genes = TMP$ENTREZID

gooc <- groupGO(gene     = sig_genes,
               OrgDb    = org.Dm.eg.db,
               ont      = "CC",
               level    = Level,
               readable = TRUE)
goom <- groupGO(gene     = sig_genes,
              OrgDb    = org.Dm.eg.db,
              ont      = "MF",
              level    = Level,
              readable = TRUE)
goob <- groupGO(gene     = sig_genes,
             OrgDb    = org.Dm.eg.db,
             ont      = "BP",
             level    = Level,
             readable = TRUE)

gooc@result$Group = "CC"
goom@result$Group = "MF"
goob@result$Group = "BP"


GO_TB <- rbind(gooc@result, goom@result, goob@result)
GO_TB <- GO_TB[GO_TB$Count!=0,]
File_name = paste("Ontology", SampleA,SampleB,Pvalue, logFC, sep="_" )
write.csv(GO_TB, paste("GO/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)

# [GO ontology plot]
ggplot(GO_TB, aes(x=Description, y=Count, fill=Group)) +
    geom_bar(stat = 'identity') +
    facet_grid(~Group, scales = 'free', space = 'free') + theme_bw() +
    theme(axis.text.x = element_text(angle = 270, hjust = 0, vjust = .5), legend.position = 'none', panel.grid =  element_blank(), strip.background = element_rect(fill = 'white'))
ggsave_GO(paste("GO/", Sample_Dir, File_name, ".png", sep="" ), nrow(GO_TB))

## Overrepresentation
File_name = paste("Enrichment", SampleA,SampleB,Pvalue, logFC, sep="_" )


ego <- enrichGO(gene          = sig_genes,
                universe      = Gene_list$ENTREZID,
                OrgDb         = org.Dm.eg.db,
                ont           = "All",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
head(ego)
SIZE = as.numeric(as.character(as.data.frame(str_split_fixed(ego@result$GeneRatio, "/",2))[[1]]))/
as.numeric(as.character(as.data.frame(str_split_fixed(ego@result$BgRatio, "/",2))[[1]]))
ego@result$size = SIZE
write.csv(ego, paste("GO/", Sample_Dir, File_name, ".csv", sep="" ), row.names=F)

# [GO Enrichment plot]
ggplot(ego@result, aes(x=Count, y=Description)) +
    geom_point(aes(size=size, color=pvalue)) + theme_bw() +
    facet_grid(ONTOLOGY~., space = 'free', scales = 'free')

ggsave_GO_enrich(paste("GO/", Sample_Dir, File_name, ".png", sep="" ), nrow(ego@result))

geneList <- Gene_list$logFC
names(geneList) <- Gene_list$ENTREZID
geneList <- sort(geneList, decreasing = T)

for(GROUP in c("CC", "BP", "MF")){
    egocc <- enrichGO(gene          = sig_genes,
            universe      = Gene_list$ENTREZID,
            OrgDb         = org.Dm.eg.db,
            ont           = GROUP,
            pAdjustMethod = "BH",
            pvalueCutoff  = 0.01,
            qvalueCutoff  = 0.05,
            readable      = TRUE)

    write.csv(egocc, paste("GO/",Sample_Dir, File_name, "_", GROUP, ".csv", sep="" ), row.names=F)
    # [GO Network category]
    Plot_and_Save <-function(){
        goplot(egocc)
        ggsave( paste("GO/",Sample_Dir, File_name,  "_", GROUP, "_category", ".png", sep="" ),
            w = 20, h = 10.9 )
    # [GO Network Genes]
        cnetplot(egocc, foldChange=geneList)
        ggsave( paste("GO/", Sample_Dir, File_name,  "_", GROUP, "_genes", ".png", sep="" ),
        w = 20, h = 10.9 )
    }
    try(Plot_and_Save(),silent=TRUE)

}


# GO Gene Set Enrichment Analysis
File_name = paste("GSEA",Pvalue, logFC, sep="_" )

for(GROUP in c("CC", "BP", "MF")){
    GOgse_CC <- gseGO(geneList     = geneList,
                  OrgDb        = org.Dm.eg.db,
                  ont          = GROUP,
                  minGSSize    = 100,
                  maxGSSize    = 500,
                  eps = 1e-10,
                  pvalueCutoff = 0.05,
                  verbose      = FALSE)
    write.csv(GOgse_CC, paste("GO/",Sample_Dir, File_name, "_", GROUP, ".csv", sep="" ), row.names=F)

    ridgeplot(GOgse_CC)
    ridgeplot_save(paste("GO/",Sample_Dir, File_name, "_", GROUP,"_ridgeplot.png", sep="" ), nrow(GOgse_CC@result))
    Plot_and_Save <-function(){
        goplot(GOgse_CC)
        ggsave( paste("GO/",Sample_Dir, File_name,  "_", GROUP, "_category", ".png", sep="" ),
        w = 20, h = 10.9 )
        cnetplot(GOgse_CC, foldChange=geneList)
        ggsave( paste("GO/",Sample_Dir, File_name,  "_", GROUP, "_genes", ".png", sep="" ),
        w = 20, h = 10.9 )
    }
    try(Plot_and_Save(),silent=TRUE)

    # [GO GSEA upsetplot]
    upsetplot(GOgse_CC)
    ggsave( paste("GO/",Sample_Dir, File_name,  "_", GROUP, "_upset", ".png", sep="" ),
        w = 20, h = 10.9 )
    # [GO GSEA Escore]
    for (i in c(1:nrow(GOgse_CC@result))){
        gseaplot2(GOgse_CC, geneSetID = i, title = GOgse_CC$Description[i], color = 'salmon', pvalue_table = TRUE)
        ggsave(paste("GO/",Sample_Dir, File_name,  "_", str_split(GOgse_CC@result$ID[i],":")[[1]][2], "_GSEA", ".png", sep="" ), w = 7.5, h = 5)
    }
}

#########################
## KEGG                 #
#########################

# gene set enrichment analysis
File_name = paste("GSEA",Pvalue, logFC, sep="_" )
geneList_kk <- Gene_list$logFC
names(geneList_kk) <- paste("Dmel", Gene_list$FLYBASECG, sep="_")
geneList_kk <- sort(geneList_kk, decreasing = T)

kk_GSEA <- gseKEGG(geneList     = geneList_kk,
               organism     = 'dme',
               minGSSize    = 10,
               pvalueCutoff = 0.05,
               verbose      = FALSE)

for (i in c(1:nrow(kk_GSEA@result))){
    gseaplot2(kk_GSEA, geneSetID = i, title = kk_GSEA$Description[i], color = 'salmon', pvalue_table = TRUE)
    ggsave(paste("KEGG/",Sample_Dir, File_name,  "_", kk_GSEA@result$ID[i], "_GSEA", ".png", sep="" ), w = 7.5, h = 5)
}


kk_GSEA2 <- KE2SY(kk_GSEA, "core_enrichment")

write.csv(kk_GSEA2, paste("KEGG/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)


geneList_smb <- Gene_list$logFC
names(geneList_smb) <-Gene_list$SYMBOL
geneList_smb <- sort(geneList_smb, decreasing = T)

Plot_and_Save <-function(){
    cnetplot(kk_GSEA2, foldChange=geneList_smb)
    ggsave( paste("KEGG/",Sample_Dir, File_name, "_genes", ".png", sep="" ), w = 20, h = 10.9 )
}
try(Plot_and_Save(),silent=TRUE)


sig_genes_kk =  paste("Dmel", TMP$FLYBASECG, sep="_")

# over-representation analysis
File_name = paste("Enrichment", Pvalue, logFC, sep="_" )
kk_Enrich <- enrichKEGG(gene = sig_genes_kk,
                   universe = names(geneList_kk),
                   organism = 'dme',
                   pvalueCutoff = 0.05)
write.csv(kk_Enrich, paste("KEGG/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)

ggplot(kk_Enrich@result, aes(x=Count, y=Description)) +
    geom_point(aes(size=GeneRatio, color=pvalue)) + theme_bw()
ggsave_GO_enrich(paste("KEGG/",Sample_Dir, File_name, ".png", sep="" ), nrow(kk_Enrich@result))


Plot_and_Save <-function(){
    cnetplot(kk_Enrich, foldChange=geneList_kk)
    ggsave( paste("KEGG/",Sample_Dir, File_name, "_genes", ".png", sep="" ), w = 20, h = 10.9 )
}
try(Plot_and_Save(),silent=TRUE)


neList_kk <- Gene_list$logFC
names(geneList_kk) <-Gene_list$ENTREZID
geneList_kk <- sort(geneList_kk, decreasing = T)

library("pathview")
#dme04624 <- pathview(gene.data  = geneList_kk,
#                        pathway.id = "04624",
#                        species    = "dme",
#                        out.prefix = paste("KEGG/",Sample_Dir, sep=""),
#                        limit      =2.5)

## WikiPathways
File_name = paste("Enrichment", Pvalue, logFC, sep="_" )
WikiP_enrich <- enrichWP(sig_genes, universe = names(geneList_kk), organism = "Drosophila melanogaster")
ggplot(WikiP_enrich@result, aes(x=Count, y=Description)) +
    geom_point(aes(size=GeneRatio, color=pvalue)) + theme_bw()
ggsave_GO_enrich(paste("WIKI/",Sample_Dir, File_name, ".png", sep="" ), nrow(kk_Enrich@result))

WikiP_enrich2 <- EN2SY(WikiP_enrich, "geneID")
write.csv(WikiP_enrich2@result, paste("WIKI/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)

Plot_and_Save <-function(){
    cnetplot(WikiP_enrich2, foldChange=geneList_smb)
    ggsave( paste("WIKI/",Sample_Dir, File_name, "_genes", ".png", sep="" ), w = 20, h = 10.9 )
}
try(Plot_and_Save(),silent=TRUE)


File_name = paste("GSEA",Pvalue, logFC, sep="_" )
WikiP_gse <- gseWP(geneList, organism = "Drosophila melanogaster")
WikiP_gse2 <- EN2SY(WikiP_gse, "core_enrichment")

if (nrow(WikiP_gse@result) >0 ){
    for (i in c(1:nrow(WikiP_gse@result))){
        gseaplot2(WikiP_gse, geneSetID = i, title = WikiP_gse$Description[i], color = 'salmon', pvalue_table = TRUE)
        ggsave(paste("WIKI/",Sample_Dir, File_name,  "_", WikiP_gse@result$ID[i], "_GSEA", ".png", sep="" ), w = 7.5, h = 5)
    }
}

write.csv(WikiP_gse2@result, paste("WIKI/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)

Plot_and_Save <-function(){
    ridgeplot(WikiP_gse)
    ridgeplot_save(paste("WIKI/",Sample_Dir, File_name, "_", GROUP,"_ridgeplot.png", sep="" ), nrow(WikiP_gse@result))
}
try(Plot_and_Save(),silent=TRUE)



## Reactome Pathway
library(ReactomePA)
File_name = paste("Enrichment", Pvalue, logFC, sep="_" )
Reactome_enrich <- enrichPathway(gene= sig_genes, pvalueCutoff = 0.05,
    readable=TRUE, organism ="fly", universe = names(geneList))
write.csv(Reactome_enrich, paste("Reactome/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)

Reactome_enrich@result <- Reactome_enrich@result[Reactome_enrich@result$pvalue<=0.05,]
ggplot(Reactome_enrich@result, aes(x=Count, y=Description)) +
    geom_point(aes(size=GeneRatio, color=pvalue)) + theme_bw()
ggsave_GO_enrich(paste("Reactome/",Sample_Dir, File_name, ".png", sep="" ), nrow(kk_Enrich@result))


File_name = paste("GSEA",Pvalue, logFC, sep="_" )
Reactome_gse <- gsePathway(geneList,
                pvalueCutoff = 0.05,
                organism = "fly",
                pAdjustMethod = "BH")
# [Reactome_gse ridgeplot]
ridgeplot(Reactome_gse)
ridgeplot_save(paste("Reactome/",Sample_Dir, File_name, "_", GROUP,"_ridgeplot.png", sep="" ), nrow(Reactome_gse@result))
write.csv(Reactome_gse, paste("Reactome/",Sample_Dir, File_name, ".csv", sep="" ), row.names=F)


for (i in c(1:nrow(Reactome_gse@result))){
    gseaplot2(Reactome_gse, geneSetID = i, title = Reactome_gse$Description[i], color = 'salmon', pvalue_table = TRUE)
    ggsave(paste("Reactome/",Sample_Dir, File_name,  "_", Reactome_gse@result$ID[i], "_GSEA", ".png", sep="" ), w = 7.5, h = 5)
}

123
123



GO ontology plot

GO Enrichment plot

GO Network category

GO Network Genes

GO GSEA upsetplot

GO GSEA Escore

pathview

Reactome_gse ridgeplot

##############
####eg
#############

eg = bitr(x, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
eg = bitr(x, fromType="ENTREZID", toType="SYMBOL", OrgDb="org.Hs.eg.db")
head(eg)

gene <- eg[[2]]


barplot(ggio, drop=TRUE, showCategory=122)
#####################################################################



###bitr_kegg: converting biological IDs using KEGG API

data(x)
hg <- x[[1]]
head(hg)
eg2np <- bitr_kegg(hg, fromType='kegg', toType='ncbi-proteinid', organism='hsa')
head(eg2np)
bitr_kegg("Z5100", fromType="kegg", toType='ncbi-geneid', organism='ece')
bitr_kegg("Z5100", fromType="kegg", toType='ncbi-proteinid', organism='ece')
bitr_kegg("Z5100", fromType="kegg", toType='uniprot', organism='ece')

################################################################################
############ GO classification
################################################################################
gene <- names(geneList)[abs(geneList) > 2]

gobp <- groupGO(gene     = gene,
               OrgDb    = org.Hs.eg.db,
               ont      = "BP",
               level    = 2,
               readable = TRUE)

head(ggo)

go <- groupGO(gene = gene, OrgDb = org.Hs.eg.db, ont = "BP", level = 2, readable = TRUE)
barplot(go, drop=TRUE, showCategory=122)
##############################
GO over-representation test
###############################
ego1 <- enrichGO(gene =gene, universe= names(geneList), ont = "CC",pAdjustMethod = "BH",pvalueCutoff  = 0.01,qvalueCutoff = 0.05,readable = TRUE)
ego2 <- enrichGO(gene =gene, universe= names(geneList), ont = "BP",pAdjustMethod = "BH",pvalueCutoff  = 0.01,qvalueCutoff = 0.05,readable = TRUE)
ego3 <- enrichGO(gene =gene, universe= names(geneList), ont = "MF",pAdjustMethod = "BH",pvalueCutoff  = 0.01,qvalueCutoff = 0.05,readable = TRUE)

head(ego)
###################################
##################################
##################################
source("http://bioconductor.org/biocLite.R")
library(clusterProfiler)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)





KEGG
kk <- enrichKEGG(gene = gene, organism = 'human', pvalueCutoff = 0.05)

head(kk)
barplot(kk, drop=TRUE, showCategory=122)


kk2 <- gseKEGG(geneList = geneList, organism = 'hsa', nPerm = 1000, minGSSize = 120, pvalueCutoff = 100,verbose = FALSE)
ead(kk2)

####
kk2 <- gseKEGG(geneList = geneList, organism = 'hsa', nPerm = 10, minGSSize = 10, pvalueCutoff = 100,verbose = FALSE)

gseaplot(kk2, geneSetID = "hsa05200")

hsa05205
hsa05165
##################################################################
#############pathview
##################################################################

library(clusterProfiler)
library(org.Hs.eg.db)
library("pathview")

hsa05418
 p  <- pathview(gene.data  = geneList,
                     pathway.id = "hsa04976",
                     species    = "hsa",
                     limit      = list(gene=max(abs(geneList)), cpd=1))

pathview(gene.data  = geneList, pathway.id = "hsa04976", species = "hsa", limit = list(gene=max(5), cpd=1))
pathview(gene.data  = geneList, pathway.id = i, species = "hsa", limit = list(gene=max(5), cpd=1))


####################################################################
####geneList
####################################################################


d = read.csv(your_csv_file)

d = read.table("geneList")
### assume 1st column is ID
### 2nd column is FC
### feature 1: numeric vector

geneList = d[,2]
names(geneList) = as.character(d[,1])  ## feature 2: named vector
geneList = sort(geneList, decreasing = TRUE)   ## feature 3: decreasing order
head(geneList)

eg = bitr(x, fromType="ENTREZID", toType="SYMBOL", annoDb="org.Hs.eg.db")

 p  <- pathview(gene.data  = geneList, pathway.id = "hsa04668", species    = "hsa", limit      = list(gene=max(abs(geneList)), cpd=1))


gene <- names(geneList)[abs(geneList) > 2]
gene.df <- bitr(gene, fromType = "ENTREZID",
        toType = c("ENSEMBL", "SYMBOL"),
        OrgDb = org.Hs.eg.db)
head(gene.df)




ego2 <- enrichGO(gene         = gene.df$ENSEMBL,
                OrgDb         = org.Hs.eg.db,
        keytype       = 'ENSEMBL',
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
ego2 <- setReadable(ego2, OrgDb = org.Hs.eg.db)


biocLite("topGO")




barplot(ggo, drop=TRUE, showCategory=12)

###############################################################
##
###############################################################
geneList = d[,2]
names(geneList) = as.character(d[,1])  ## feature 2: named vector
geneList = sort(geneList, decreasing = TRUE)   ## feature 3: decreasing order
head(geneList)

kk2 <- gseKEGG(geneList     = geneList,
               organism     = 'hsa',
               nPerm        = 1000,
               minGSSize    = 120,
               pvalueCutoff = 0,
               verbose      = FALSE)
gseaplot(kk2, geneSetID = "hsa04668")


#######################
####write the results
#######################

write.csv(fortify(kk,showCategory=120),file="kk.matrix",quote=F,sep='\t')









##################################

a <- c("3","7","14","21")
b <- read.table("single_symol.matrix")

for(i in a)
 {
 x <- read.table("3")
 probes= x$V1
 probes1=match(probes,x$V1)
 probes2=match(probes,b$V1)
 sum(is.na(probes1))
 sum(is.na(probes1))
 List=data.frame(FC=(x$V4[probes1]* -1),S=b$V2[probes2],Co=b$V1[probes2])
 List=data.frame(Symbol=List$S, FC=List$FC )
 A<- List[order(List[,2],decreasing=F),]
 write.table(A,file=paste(i,"-tesst.txt",sep=''),sep='\t',quote=F,row.names=F)
 }

for (i in b[[1]])
 {
  probes= i
 probes1= match(probes,b$V1)
  c=b$V2[probes1]
 png(file=paste(i,"-",c,"-GSEA.png",sep=''),wi=400,he=400)
 gseaplot(kk2, geneSetID = i,title=paste(c,"0dpeVs7dpe"))
 dev.off()
 }


 for (i in b[[1]])
 {
 p  <- pathview(gene.data  = geneList,
 pathway.id = i,
  species    = "hsa",
                   limit      = list(gene=max(abs(geneList)), cpd=1))
 }


geneList=data.frame(List=List$Symbol,FC=List$FC* -1)


 png(file=paste(i,"-",title,".png",sep=''),wi=900,he=900)
 gseaplot(kk2, geneSetID = i,title= T)
 dev.off()}


a <- read.table("list")
 x <- a[[1]]
eg = bitr(x, fromType="ENTREZID", toType="SYMBOL", annoDb="org.Hs.eg.db")
dim(eg)
write.table(eg,file="",row.names=F,quote=F,sep='\t')

Error in installation

config.status: creating src/Makevars.tmp
config.status: creating src/Makevars
config.status: creating src/config.h
** libs
gfortran -fno-optimize-sibling-calls  -fpic  -g -O2  -c AMD/Source/amd.f -o AMD/Source/amd.o
make: gfortran: No such file or directory
make: *** [/usr/lib64/R/etc/Makeconf:195: AMD/Source/amd.o] Error 127
ERROR: compilation failed for package 'igraph'
* removing '/home/karobben/R/x86_64-pc-linux-gnu-library/4.0/igraph'
During startup - Warning message:

problem: lack of gcc-fortran

sudo pacman -S gcc-fortran

Error in using

fail to download KEGG data...
Error in download.KEGG.Path(species) : 
  'species' should be one of organisms listed in 'http://www.genome.jp/kegg/catalog/org_list.html'...
Calls: gseKEGG ... prepare_KEGG -> download_KEGG -> download.KEGG.Path
In addition: Warning message:
In download.file(url, method = method, ...) :
  URL 'https://rest.kegg.jp/link/hsa/pathway': status was 'Failure when receiving data from the peer'
Execution halted

This is such an annoying issue. The weird thing is that everything works fine when I use the code in the terminal, but as soon as I write the code into a script, it just doesn’t seem to work. The author suggests that the issue could be resolved by updating to the latest version. However, this would mean that I would also need to update either the Bioconductor or R base. I don’t want to update anything, I just want the job to be done. Thankfully, Slohr’s answer in 2022 provided a perfect solution to this problem.

## https://www.genome.jp/kegg/rest/keggapi.html
## kegg_link('hsa', 'pathway')
ttp_kegg_link <- function(target_db, source_db) {
    url <- paste0("https://rest.kegg.jp/link/", target_db, "/", source_db, collapse="")
    local_mydownload <- function (url, method, quiet = TRUE, ...) 
    {
        if (capabilities("libcurl")) {
            dl <- tryCatch(utils::download.file(url, quiet = quiet, 
                method = "libcurl", ...), error = function(e) NULL)
        }
        else {
            dl <- tryCatch(downloader::download(url, quiet = TRUE, 
                method = method, ...), error = function(e) NULL)
        }
        return(dl)
    }
    local_kegg_rest <- function (rest_url) 
    {
        message("Reading KEGG annotation online:\n")
        f <- tempfile()
        dl <- local_mydownload(rest_url, destfile = f)
        if (is.null(dl)) {
            message("fail to download KEGG data...")
            return(NULL)
        }
        content <- readLines(f)
        content %<>% strsplit(., "\t") %>% do.call("rbind", .)
        res <- data.frame(from = content[, 1], to = content[, 2])
        return(res)
    }
    local_kegg_rest(url)
}

ttp_kegg_list <- function(db) {
    url <- paste0("https://rest.kegg.jp/list/", db, collapse="")
    local_mydownload <- function (url, method, quiet = TRUE, ...) 
    {
        if (capabilities("libcurl")) {
            dl <- tryCatch(utils::download.file(url, quiet = quiet, 
                method = "libcurl", ...), error = function(e) NULL)
        }
        else {
            dl <- tryCatch(downloader::download(url, quiet = TRUE, 
                method = method, ...), error = function(e) NULL)
        }
        return(dl)
    }
    local_kegg_rest <- function (rest_url) 
    {
        message("Reading KEGG annotation online:\n")
        f <- tempfile()
        dl <- local_mydownload(rest_url, destfile = f)
        if (is.null(dl)) {
            message("fail to download KEGG data...")
            return(NULL)
        }
        content <- readLines(f)
        content %<>% strsplit(., "\t") %>% do.call("rbind", .)
        res <- data.frame(from = content[, 1], to = content[, 2])
        return(res)
    }
    local_kegg_rest(url)
}

rlang::env_unlock(env = asNamespace('clusterProfiler'))
rlang::env_binding_unlock(env = asNamespace('clusterProfiler'))
assign('kegg_link', ttp_kegg_link, envir = asNamespace('clusterProfiler'))
assign('kegg_list', ttp_kegg_list, envir = asNamespace('clusterProfiler'))
rlang::env_binding_lock(env = asNamespace('clusterProfiler'))
rlang::env_lock(asNamespace('clusterProfiler'))

clusterProfiler

https://karobben.github.io/2020/06/14/R/clusterProfiler/

Author

Karobben

Posted on

2020-06-14

Updated on

2024-01-11

Licensed under

#Bioinformatics R GO KEGG

clusterProfiler