library(PubMatrixR)
library(knitr)
library(kableExtra)
#library(msigdf)
library(dplyr)
library(pheatmap)
library(ggplot2)
PubMatrixR is an R package designed to analyze literature co-occurrence patterns by systematically searching PubMed and PMC databases. This vignette demonstrates how to:
This package is a fork of the original PubMatrixR by tslaird. Our gratitude to the original author.
For better performance and higher rate limits, we recommend obtaining an NCBI API key:
To obtain your free NCBI API key, visit: https://support.nlm.nih.gov/kbArticle/?pn=KA-05317
Once you have your API key, you can use it in PubMatrixR like this:
For this example, we’ll extract genes related to WNT signaling and obesity from the MSigDB database:
# Extract WNT-related genes
A <- msigdf::msigdf.human %>%
dplyr::filter(grepl(geneset, pattern = "wnt", ignore.case = TRUE)) %>%
dplyr::pull(symbol) %>%
unique()
# Extract obesity-related genes
B <- msigdf::msigdf.human %>%
dplyr::filter(grepl(geneset, pattern = "obesity", ignore.case = TRUE)) %>%
dplyr::pull(symbol) %>%
unique()
# Sample genes for demonstration (making them equal in length)
A <- sample(A, 10, replace = FALSE)
B <- sample(B, 10, replace = FALSE)
Now we’ll search for co-occurrences between our gene sets in PubMed literature:
# Run actual PubMatrix analysis
current_year <- format(Sys.Date(), "%Y")
result <- PubMatrix(A = A,
B = B,
Database = "pubmed",
daterange = c(2020, current_year),
outfile = "pubmatrix_result")
#> [1] "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&datetype=pdat&mindate=2020&maxdate=2025"
#> [1] "<b><a href=\"https://www.ncbi.nlm.nih.gov/pubmed/?term=2020:2025[DP]+AND+"
The co-occurrence matrix shows the number of publications mentioning each pair of genes:
kable(result,
caption = "Co-occurrence Matrix: WNT Genes vs Obesity Genes (Publication Counts)",
align = "c",
format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center") %>%
kableExtra::add_header_above(c(" " = 1, "Obesity Genes" = length(A)))
| WNT1 | WNT2 | WNT3A | WNT5A | WNT7B | CTNNB1 | DVL1 | |
|---|---|---|---|---|---|---|---|
| LEPR | 3 | 0 | 0 | 2 | 0 | 1 | 0 |
| ADIPOQ | 1 | 0 | 0 | 3 | 0 | 5 | 0 |
| PPARG | 0 | 2 | 1 | 5 | 1 | 19 | 0 |
| TNF | 43 | 4 | 61 | 57 | 3 | 115 | 1 |
| IL6 | 41 | 3 | 47 | 73 | 5 | 86 | 2 |
| ADRB2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| INSR | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
Let’s first examine which genes have the highest publication counts:
# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data <- data.frame(
gene = rownames(result),
total_pubs = rowSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with B genes
a_genes_data$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data$max_overlap <- apply(result, 1, max)
# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data <- data.frame(
gene = colnames(result),
total_pubs = colSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with A genes
b_genes_data$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data$max_overlap <- apply(result, 2, max)
# Plot A genes colored by their strongest B gene partner
p1 <- ggplot(a_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
geom_bar(stat = "identity")+
coord_flip() +
labs(title = "List A Genes by Publication Count",
subtitle = "Colored by strongest List B gene partner",
x = "Genes (List A)",
y = "Total Publications",
fill = "Strongest B Partner") +
theme_minimal() +
theme(legend.position = "bottom") +
scale_fill_brewer(palette = "Set1")
# Plot B genes colored by their strongest A gene partner
p2 <- ggplot(b_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "List B Genes by Publication Count",
subtitle = "Colored by strongest List A gene partner",
x = "Genes (List B)",
y = "Total Publications",
fill = "Strongest A Partner")+
theme_minimal() +
theme(legend.position = "bottom") +
scale_fill_brewer(palette = "Set1")
print(p1)
The heatmap displays overlap percentages calculated from the publication co-occurrence counts:
plot_pubmatrix_heatmap(result,
title = "WNT-Obesity Gene Overlap Percentages",
show_numbers = TRUE ,
width = 16,
height = 14)
Overlap percentage heatmap with values displayed in each cell
For a cleaner visualization without numbers, useful for presentations:
plot_pubmatrix_heatmap(result,
title = "WNT-Obesity Gene Co-occurrence (Clean)",
show_numbers = FALSE,
width = 16,
height = 14)
Co-occurrence heatmap without numbers for better visual clarity
Asymmetric lists
A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")
B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")
# Run actual PubMatrix analysis
current_year <- format(Sys.Date(), "%Y")
result <- PubMatrix(A = A,
B = B,
Database = "pubmed",
daterange = c(2020, current_year),
outfile = "pubmatrix_result")
#> [1] "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&datetype=pdat&mindate=2020&maxdate=2025"
#> [1] "<b><a href=\"https://www.ncbi.nlm.nih.gov/pubmed/?term=2020:2025[DP]+AND+"
The co-occurrence matrix shows the number of publications mentioning each pair of genes:
kable(result,
caption = "Co-occurrence Matrix: Longer Lists (Publication Counts)",
align = "c",
format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center") %>%
kableExtra::add_header_above(c(" " = 1, "Genes" = length(A)))
| NCOR2 | NCSTN | NKD1 | NOTCH1 | NOTCH4 | NUMB | PPARD | PSEN2 | PTCH1 | RBPJ | SKP2 | TCF7 | TP53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EIF1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| EIF1AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 |
| EIF2B1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| EIF2B2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| EIF2B3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| EIF2B4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| EIF2B5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| EIF2S1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| EIF2S2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| EIF2S3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ELAVL1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data2 <- data.frame(
gene = rownames(result),
total_pubs = rowSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with B genes
a_genes_data2$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data2$max_overlap <- apply(result, 1, max)
# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data2 <- data.frame(
gene = colnames(result),
total_pubs = colSums(result),
stringsAsFactors = FALSE
)
# Add color coding based on max overlap with A genes
b_genes_data2$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data2$max_overlap <- apply(result, 2, max)
# Plot A genes colored by their strongest B gene partner
p3 <- ggplot(a_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "List A Genes by Publication Count (Asymmetric)",
subtitle = "Colored by strongest List B gene partner",
x = "Genes (List A)",
y = "Total Publications",
fill = "Strongest B Partner") +
theme_minimal() +
theme(legend.position = "bottom")+
scale_fill_brewer(palette = "Set1")
# Plot B genes colored by their strongest A gene partner
p4 <- ggplot(b_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "List B Genes by Publication Count (Asymmetric)",
subtitle = "Colored by strongest List A gene partner",
x = "Genes (List B)",
y = "Total Publications",
fill = "Strongest A Partner") +
theme_minimal() +
theme(legend.position = "bottom")+
scale_fill_brewer(palette = "Set1")
print(p3)
The heatmap displays overlap percentages calculated from the publication co-occurrence counts:
plot_pubmatrix_heatmap(result,
title = "Asymmetric Gene Lists Overlap Percentages",
show_numbers = TRUE,
width = 16,
height = 14)
Overlap percentage heatmap with values displayed in each cell
This vignette demonstrated:
The resulting matrices and visualizations can help identify: - Strong literature connections between gene sets - Potential research gaps (low co-occurrence pairs) - Patterns in publication trends over time - Most studied genes and their strongest literature partners
sessionInfo()
#> R version 4.5.1 (2025-06-13)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Tahoe 26.0.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: Europe/London
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ggplot2_4.0.0 pheatmap_1.0.13 dplyr_1.1.4 kableExtra_1.4.0
#> [5] knitr_1.50 PubMatrixR_2.0
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 tidyselect_1.2.1
#> [5] xml2_1.4.0 stringr_1.5.2 parallel_4.5.1 jquerylib_0.1.4
#> [9] systemfonts_1.3.1 scales_1.4.0 textshaping_1.0.4 yaml_2.3.10
#> [13] fastmap_1.2.0 R6_2.6.1 labeling_0.4.3 generics_0.1.4
#> [17] curl_7.0.0 tibble_3.3.0 svglite_2.2.1 bslib_0.9.0
#> [21] pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.6 cachem_1.1.0
#> [25] stringi_1.8.7 xfun_0.53 S7_0.2.0 sass_0.4.10
#> [29] viridisLite_0.4.2 cli_3.6.5 withr_3.0.2 magrittr_2.0.4
#> [33] digest_0.6.37 grid_4.5.1 pbapply_1.7-4 rstudioapi_0.17.1
#> [37] lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0
#> [41] farver_2.1.2 rmarkdown_2.30 tools_4.5.1 pkgconfig_2.0.3
#> [45] htmltools_0.5.8.1
# Additional system details
cat("Date generated:", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"), "\n")
#> Date generated: 2025-10-20 21:58:12 BST
cat("R version:", R.version.string, "\n")
#> R version: R version 4.5.1 (2025-06-13)
cat("Platform:", R.version$platform, "\n")
#> Platform: aarch64-apple-darwin20
cat("Operating System:", Sys.info()["sysname"], Sys.info()["release"], "\n")
#> Operating System: Darwin 25.0.0
cat("User:", Sys.info()["user"], "\n")
#> User: entm