Commit 861e0fc3 authored by liiskolb's avatar liiskolb
Browse files

add short-link

parent 6eca4eef
......@@ -50,6 +50,7 @@ gp_globals$base_url = "http://biit.cs.ut.ee/gprofiler"
#' GO (GO:BP, GO:MF, GO:CC to select a particular GO branch), KEGG, REAC, TF,
#' MIRNA, CORUM, HP, HPA, WP. Please see the g:GOSt web tool for the comprehensive
#' list and details on incorporated data sources.
#' @param as_short_link indicator to return results as short-link to the g:Profiler web tool. If set to TRUE, then the function returns the results URL as a character string instead of the data.frame.
#' @return A named list where 'result' contains data.frame with the enrichment analysis results and 'meta' contains metadata needed for Manhattan plot. If the input
#' consisted of several lists the corresponding list is indicated with a variable
#' 'query'.
......@@ -58,6 +59,8 @@ gp_globals$base_url = "http://biit.cs.ut.ee/gprofiler"
#' The latter conveys info about the intersecting genes between the corresponding query and term.
#'
#' The result fields are further described in \url{https://biit.cs.ut.ee/gprofiler_beta/page/apis#gost_query_results}
#'
#' If 'as_short_link' is set to TRUE, then the result is a character short-link to see and share corresponding results via the g:Profiler web tool.
#' @author Liis Kolberg <liis.kolberg@@ut.ee>, Uku Raudvere <uku.raudvere@@ut.ee>
#' @examples
#' gostres <- gost(c("X:1000:1000000", "rs17396340", "GO:0005005", "ENSG00000156103", "NLRP1"))
......@@ -76,7 +79,8 @@ gost <- function(query,
domain_scope = c("annotated", "known", "custom", "custom_annotated"),
custom_bg = NULL,
numeric_ns = "",
sources = NULL
sources = NULL,
as_short_link = FALSE
) {
url = paste0(file.path(gp_globals$base_url, "api", "gost", "profile"), "/")
......@@ -101,13 +105,10 @@ gost <- function(query,
query = query[!is.na(query)]
}
# Parameters
correction_methods <- c("g_SCS", "bonferroni", "fdr", "false_discovery_rate", "gSCS", "analytical")
if (length(correction_method)>1){
correction_method = "g_SCS"
}
## evaluate choices
correction_method <- match.arg(correction_method, correction_methods)
correction_method <- match.arg(correction_method)
domain_scope <- match.arg(domain_scope)
if (startsWith(organism, "gp__")){
message("Detected custom GMT source request")
......@@ -131,40 +132,73 @@ gost <- function(query,
}
t <- ifelse(length(custom_bg) == 1, custom_bg <- jsonlite::unbox(custom_bg), custom_bg <- custom_bg)
}else{
if (all(domain_scope %in% c("custom_annotated", "custom"))){
if (domain_scope %in% c("custom_annotated", "custom")){
stop("Domain scope is set to custom, but no background genes detected from the input.")
}
}
domain_scopes <- c("annotated", "known", "custom", "custom_annotated")
if (length(domain_scope) > 1){
domain_scope = "annotated"
}
domain_scope <- match.arg(domain_scope, domain_scopes)
if (as_short_link){
# query should be a string in this case
body <- jsonlite::toJSON((
list(
organism = jsonlite::unbox(organism),
query = query,
sources = sources,
user_threshold = jsonlite::unbox(user_threshold),
all_results = jsonlite::unbox(!significant),
ordered = jsonlite::unbox(ordered_query),
no_evidences = jsonlite::unbox(!evcodes),
combined = jsonlite::unbox(multi_query),
measure_underrepresentation = jsonlite::unbox(measure_underrepresentation),
no_iea = jsonlite::unbox(exclude_iea),
domain_scope = jsonlite::unbox(domain_scope),
numeric_ns = jsonlite::unbox(numeric_ns),
significance_threshold_method = jsonlite::unbox(correction_method),
background = custom_bg,
output = jsonlite::unbox("json")
)
),
auto_unbox = FALSE,
null = "null")
# Headers
if(!is.null(names(query))){
query2 = paste0(unlist(lapply(names(query), function(x) paste(">", x, "\n", paste0(query[[x]], collapse = " ")))), collapse = "\n")
multi_query = TRUE
}else{
query2 = paste0(query, collapse = " ")
}
url = file.path("http://biit.cs.ut.ee", "gplink", "l")
body <- jsonlite::toJSON((
list(
url = jsonlite::unbox(file.path(gprofiler2::get_base_url(), "gost")),
payload = {
list(
organism = jsonlite::unbox(organism),
query = jsonlite::unbox(query2),
sources = sources,
user_threshold = jsonlite::unbox(user_threshold),
all_results = jsonlite::unbox(!significant),
ordered = jsonlite::unbox(ordered_query),
no_evidences = jsonlite::unbox(!evcodes),
combined = jsonlite::unbox(multi_query),
measure_underrepresentation = jsonlite::unbox(measure_underrepresentation),
no_iea = jsonlite::unbox(exclude_iea),
domain_scope = jsonlite::unbox(domain_scope),
numeric_ns = jsonlite::unbox(numeric_ns),
significance_threshold_method = jsonlite::unbox(correction_method),
background = custom_bg)
}
)
),
auto_unbox = FALSE,
null = "null")
} else {
body <- jsonlite::toJSON((
list(
organism = jsonlite::unbox(organism),
query = query,
sources = sources,
user_threshold = jsonlite::unbox(user_threshold),
all_results = jsonlite::unbox(!significant),
ordered = jsonlite::unbox(ordered_query),
no_evidences = jsonlite::unbox(!evcodes),
combined = jsonlite::unbox(multi_query),
measure_underrepresentation = jsonlite::unbox(measure_underrepresentation),
no_iea = jsonlite::unbox(exclude_iea),
domain_scope = jsonlite::unbox(domain_scope),
numeric_ns = jsonlite::unbox(numeric_ns),
significance_threshold_method = jsonlite::unbox(correction_method),
background = custom_bg,
output = jsonlite::unbox("json")
)
),
auto_unbox = FALSE,
null = "null")
}
# Headers
headers <-
list("Accept" = "application/json",
"Content-Type" = "application/json",
......@@ -196,6 +230,12 @@ gost <- function(query,
}
res <- jsonlite::fromJSON(txt)
if (as_short_link){
shortlink = paste0('https://biit.cs.ut.ee/gplink/l/', res$result)
return(shortlink)
}
df = res$result
meta = res$meta
......
......@@ -10,7 +10,8 @@ gost(query, organism = "hsapiens", ordered_query = FALSE,
user_threshold = 0.05, correction_method = c("g_SCS", "bonferroni",
"fdr", "false_discovery_rate", "gSCS", "analytical"),
domain_scope = c("annotated", "known", "custom", "custom_annotated"),
custom_bg = NULL, numeric_ns = "", sources = NULL)
custom_bg = NULL, numeric_ns = "", sources = NULL,
as_short_link = FALSE)
}
\arguments{
\item{query}{vector, or a (named) list of vectors for multiple queries, that can consist of mixed types of gene IDs (proteins, transcripts, microarray IDs, etc), SNP IDs, chromosomal intervals or term IDs.}
......@@ -51,6 +52,8 @@ This parameter does not work if 'multi_query' is set to TRUE.}
GO (GO:BP, GO:MF, GO:CC to select a particular GO branch), KEGG, REAC, TF,
MIRNA, CORUM, HP, HPA, WP. Please see the g:GOSt web tool for the comprehensive
list and details on incorporated data sources.}
\item{as_short_link}{indicator to return results as short-link to the g:Profiler web tool. If set to TRUE, then the function returns the results URL as a character string instead of the data.frame.}
}
\value{
A named list where 'result' contains data.frame with the enrichment analysis results and 'meta' contains metadata needed for Manhattan plot. If the input
......@@ -61,6 +64,8 @@ A named list where 'result' contains data.frame with the enrichment analysis res
The latter conveys info about the intersecting genes between the corresponding query and term.
The result fields are further described in \url{https://biit.cs.ut.ee/gprofiler_beta/page/apis#gost_query_results}
If 'as_short_link' is set to TRUE, then the result is a character short-link to see and share corresponding results via the g:Profiler web tool.
}
\description{
Interface to the g:Profiler tool g:GOSt (\url{https://biit.cs.ut.ee/gprofiler/gost}) for functional enrichments analysis of gene lists.
......
......@@ -5,7 +5,9 @@ title: "Gene list functional enrichment analysis and namespace conversion with g
output:
prettydoc::html_pretty:
theme: cayman
#theme: architect
highlight: github
toc: true
vignette: >
%\VignetteIndexEntry{gprofiler2}
%\VignetteEngine{knitr::rmarkdown}
......@@ -48,7 +50,7 @@ The main tools in [g:Profiler](https://biit.cs.ut.ee/gprofiler) are:
* [g:Convert](https://biit.cs.ut.ee/gprofiler/convert) - gene/protein/transcript identifier conversion across various namespaces
* [g:Orth](https://biit.cs.ut.ee/gprofiler/orth) - orthology search across species
The input for any of the tools can consist of mixed types of gene identifiers, SNP rs-IDs, chromosomal intervals or term IDs. The gene IDs from chromosomal regions are retrieved automatically. The gene doesn't need to fit the region fully. The format for chromosome regions is chr:region\_start:region\_end, e.g. *X:1:2000000*. In case of term IDs like [GO:0007507](http://www.informatics.jax.org/vocab/gene_ontology/GO:0007507) (heart development), g:Profiler uses all the genes annotated to that term as an input (in this case about six hundred human genes associated to heart development). Fully numeric identifiers need to be prefixed with the corresponding namespace. g:Profiler will automatically prefix all the detected numeric IDs using the prefix determined by the selected numeric namespace parameter.
The input for any of the tools can consist of mixed types of gene identifiers, SNP rs-IDs, chromosomal intervals or term IDs. The gene IDs from chromosomal regions are retrieved automatically. The gene doesn't need to fit the region fully. The format for chromosome regions is chr:region\_start:region\_end, e.g. X\:1\:2000000. In case of term IDs like [GO:0007507](http://www.informatics.jax.org/vocab/gene_ontology/GO:0007507) (heart development), g:Profiler uses all the genes annotated to that term as an input (in this case about six hundred human genes associated to heart development). Fully numeric identifiers need to be prefixed with the corresponding namespace. g:Profiler will automatically prefix all the detected numeric IDs using the prefix determined by the selected numeric namespace parameter.
* [g:SNPense](https://biit.cs.ut.ee/gprofiler/snpense) - mapping SNP rs-identifiers to chromosome positions, protein coding genes and variant effects
......@@ -94,7 +96,7 @@ In case of Gene Ontology (GO), the `exclude_iea = TRUE` would exclude the electr
In order to measure under-representation instead of over-representation set `measure_underrepresentation = TRUE`.
By default, the `user_threshold = 0.05` which defines a custom p-value threshold for the displayed results. Results with larger p-values are excluded. This is a possibility to additionally filter the results and threshold 0.05 means that all the significant results are shown. However, this threshold does not decide over the significance.
By default, `user_threshold = 0.05`, which defines a custom p-value significance threshold for the results. Results with a smaller p-value are tagged as significant. We don't recommend setting it higher than 0.05.
In order to reduce the amount of false positives, a [multiple testing correction method](https://biit.cs.ut.ee/gprofiler/page/docs#significance_threhshold) is applied to the enrichment p-values. By default, our tailor-made algorithm g\:SCS is used (`correction_method = "gSCS"` with synonyms `g_SCS` and `analytical`), but there are also options to apply the Bonferroni correction (`correction_method = "bonferroni"`) or FDR (`correction_method = "fdr"`). The adjusted p-values are reported in the results.
......@@ -112,10 +114,10 @@ gostres <- gost(query = c("X:1000:1000000", "rs17396340", "GO:0005005", "ENSG000
measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = NULL)
numeric_ns = "", sources = NULL, as_short_link = FALSE)
```
The result is a named `list` where *"result"* is a `data.frame` with the enrichment analysis results and *"meta"* containing a named `list` with all the metadata for the query.
The result is a named `list` where **"result"** is a `data.frame` with the enrichment analysis results and **"meta"** containing a named `list` with all the metadata for the query.
```{r}
names(gostres)
......@@ -150,9 +152,9 @@ The result `data.frame` contains the following columns:
names(gostres$meta)
```
The query parameters are listed in the *"query_metadata"* listing of the metadata object. The *"result_metadata"* includes the statistics of data sources that are used in the enrichment testing. This includes the *"domain_size"* showing the number of genes annotated to this domain. The *"number_of_terms"* indicating the number of terms g:Profiler has in the database for this source and the nominal significance *"threshold"* for this source. The *"genes_metadata"* shows the specifics of the query genes (failed, ambiguous or duplicate inputs) and their mappings to the ENSG namespace. In addition, the query time and the used g:Profiler data version are shown in the metadata.
The query parameters are listed in the **"query_metadata"** part of the metadata object. The **"result_metadata"** includes the statistics of data sources that are used in the enrichment testing. This includes the **"domain_size"** showing the number of genes annotated to this domain, the **"number_of_terms"** indicating the number of terms g:Profiler has in the database for this source, and the nominal significance **"threshold"** for this source. The **"genes_metadata"** shows the specifics of the query genes (failed, ambiguous or duplicate inputs) and their mappings to the ENSG namespace. In addition, the query time and the used g:Profiler data version are shown in the metadata.
The parameter `evcodes = TRUE` includes the evidence codes in the results. In addition, a column *"intersection"* will appear in the results showing the input gene IDs that intersect with the corresponding functional term. Note that this parameter can decrease the performance and make the query slower.
The parameter `evcodes = TRUE` includes the evidence codes in the results. In addition, a column **"intersection"** will appear in the results showing the input gene IDs that intersect with the corresponding functional term. Note that this parameter can decrease the performance and make the query slower.
```{r}
gostres2 <- gost(query = c("X:1000:1000000", "rs17396340", "GO:0005005", "ENSG00000156103", "NLRP1"),
......@@ -173,11 +175,20 @@ The result `data.frame` will include additional columns:
* evidence_codes - a list of all evidence codes for the intersecting genes between input and the term. The evidence codes are separated by commas for each gene.
* intersection - a comma separated list of genes from the query that are annotated to the corresponding term
The query results can also be gathered into a short-link to the g:Profiler web tool. For that, set the parameter `as_short_link = TRUE`. In this case, the function `gost()` returns only the web tool link to the results as a character string. For example, this is useful when you discover an interesting result you want to instantly share with your colleagues. Then you can just programmatically generate the short-link and copy it to your colleagues.
```{r eval = FALSE}
gostres_link <- gost(query = c("X:1000:1000000", "rs17396340", "GO:0005005", "ENSG00000156103", "NLRP1"),
as_short_link = TRUE)
```
This query returns a short-link of form [https://biit.cs.ut.ee/gplink/l/HfapQyB5TJ](https://biit.cs.ut.ee/gplink/l/HfapQyB5TJ).
### Multiple queries
The function `gost` also allows to perform enrichment on multiple input gene lists. Multiple queries are automatically detected if the input `query` is a `list` of vectors with gene identifiers and the results are combined into identical `data.frame` as in case of single query.
```{r warning = FALSE}
```{r}
multi_gostres1 <- gost(query = list("chromX" = c("X:1000:1000000", "rs17396340",
"GO:0005005", "ENSG00000156103", "NLRP1"),
"chromY" = c("Y:1:10000000", "rs17396340",
......@@ -189,11 +200,11 @@ multi_gostres1 <- gost(query = list("chromX" = c("X:1000:1000000", "rs17396340",
head(multi_gostres1$result)
```
The column **query** in the result `data.frame` will now contain the corresponding name for the query. If no name is specified, then the query name is defined as the order of query with the prefix "query_".
The column **"query"** in the result `data.frame` will now contain the corresponding name for the query. If no name is specified, then the query name is defined as the order of query with the prefix "query\_".
Another option for multiple gene lists is setting the parameter `multi_query = TRUE`. Then the results from all of the input queries are grouped according to term IDs for better comparison.
```{r warning = FALSE}
```{r}
multi_gostres2 <- gost(query = list("chromX" = c("X:1000:1000000", "rs17396340",
"GO:0005005", "ENSG00000156103", "NLRP1"),
"chromY" = c("Y:1:10000000", "rs17396340",
......@@ -230,7 +241,7 @@ The enrichment results are visualized with a [Manhattan-like-plot](https://biit.
gostplot(gostres, capped = TRUE, interactive = TRUE)
```
The x-axis represents the functional terms that are grouped and color-coded according to data sources and positioned according to the fixed **source_order**. The order is defined in a way that terms that are closer to each other in the source hierarchy are also next to each other in the Manhattan plot.
The x-axis represents the functional terms that are grouped and color-coded according to data sources and positioned according to the fixed **"source_order"**. The order is defined in a way that terms that are closer to each other in the source hierarchy are also next to each other in the Manhattan plot.
The source colors are adjustable with the parameter `pal` that defines the color map with a named `list`.
The y-axis shows the adjusted p-values in negative log10 scale. Every circle is one term and is sized according to the term size, i.e. larger terms have larger circles. If `interactive = TRUE`, then an interactive plot is returned using the `plotly` package.
......@@ -245,7 +256,7 @@ p <- gostplot(gostres, capped = FALSE, interactive = FALSE)
p
```
The function `publish_gostplot` takes the static plot object as an input and enables to highlight a selection of interesting terms from the results with numbers and table of results. These can be set with parameter `highlight_terms` listing the term IDs in a `vector` or as a `data.frame` with column "term_id" such as a subset of the result `data.frame`.
The function `publish_gostplot` takes the static plot object as an input and enables to highlight a selection of interesting terms from the results with numbers and table of results. These can be set with parameter `highlight_terms` listing the term IDs in a `vector` or as a `data.frame` with column **"term_id"** such as a subset of the result `data.frame`.
```{r fig.width = 9.5}
pp <- publish_gostplot(p, highlight_terms = c("GO:0048013", "REAC:R-HSA-3928663"),
......@@ -266,7 +277,7 @@ pt <- publish_gosttable(gostres, highlight_terms = gostres$result[c(1:2,10,100:1
filename = NULL)
```
The parameter `use_colors = FALSE` indicates that the p-values column should not be highlighted with background colors. The `show_columns` is used to list the names of additional columns to show in the table in addition to the "term_id" and "p_value".
The parameter `use_colors = FALSE` indicates that the p-values column should not be highlighted with background colors. The `show_columns` is used to list the names of additional columns to show in the table in addition to the **"term_id"** and **"p_value"**.
The same functions work also in case of multiquery results showing multiple Manhattan plots on top of each other:
......@@ -300,13 +311,12 @@ Available data sources and their abbreviations are:
### Custom data sources with `upload_GMT_file`
In addition to the available GO, KEGG, etc data sources, users can upload their own custom data source using the Gene Matrix Transposed file format (GMT). The file format is described in [here](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29). The users can compose the files themselves or use pre-compiled gene sets from available dedicated websites like Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/genesets.jsp)), etc.
In addition to the available GO, KEGG, etc. data sources, users can upload their own custom data source using the Gene Matrix Transposed file format (GMT). The file format is described [here](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29). The users can compose the files themselves or use pre-compiled gene sets from available dedicated websites like Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/genesets.jsp)), etc. The GMT files for g:Profiler default sources (except for KEGG and Transfac as we are restricted by data source licenses) are downloadable from the Data sources section in [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost).
`upload_GMT_file` enables to upload GMT file(s). The input `gmtfile` is the filename of the GMT file together with the path to the file. The input can also be several GMT files compressed into a ZIP file.
The file extension should be **.gmt** or **.zip** in case of multiple GMT files. The uploaded filename is used to define the source name in the enrichment results.
For example, using the BioCarta gene sets downloaded from the [
MSigDB Collections] (http://software.broadinstitute.org/gsea/msigdb/collections.jsp#C7)
For example, using the BioCarta gene sets downloaded from the [MSigDB Collections](http://software.broadinstitute.org/gsea/msigdb/collections.jsp#C7):
```{r eval = F}
download.file(url = "http://software.broadinstitute.org/gsea/resources/msigdb/7.0/c2.cp.biocarta.v7.0.symbols.gmt", destfile = "extdata/biocarta.gmt")
......@@ -316,7 +326,7 @@ download.file(url = "http://software.broadinstitute.org/gsea/resources/msigdb/7.
upload_GMT_file(gmtfile = "extdata/biocarta.gmt")
```
The result is a string that denotes the unique ID of the uploaded data source in the [g:Profiler](https://biit.cs.ut.ee/gprofiler) database. In this examaple, the ID is **gp__TEXF_hZLM_d18**.
The result is a string that denotes the unique ID of the uploaded data source in the [g:Profiler](https://biit.cs.ut.ee/gprofiler) database. In this example, the ID is **gp\_\_TEXF\_hZLM\_d18**.
After the upload, this ID can be used as a value for the parameter `organism` in the `gost` function. The input `query` should consist of identifiers that are available in the GMT file. Note that all the genes in the GMT file define the domain size and therefore it is not sufficient to include only the selection of interesting terms to the file.
......@@ -327,7 +337,8 @@ custom_gostres <- gost(query = c("MAPK3", "PIK3C2G", "HRAS", "PIK3R1", "MAP2K1",
head(custom_gostres$result)
```
There is no need to repeatedly upload the same GMT file(s) every time before the enrichment analysis. This can only be uploaded once and then the ID can be used in any further enrichment analyses that are based on that custom source. The same ID can also be used in the [web tool](https://biit.cs.ut.ee/gprofiler) as a token under the Custom GMT options. For example, the same query in the web tool is available from [https://biit.cs.ut.ee/gplink/l/jh3HdbUWQZ](https://biit.cs.ut.ee/gplink/l/jh3HdbUWQZ).
There is no need to repeatedly upload the same GMT file(s) every time before the enrichment analysis. The file needs to be uploaded only once, and then the ID can be used in any further enrichment analyses that are based on that custom source. The same ID can also be used in the [web tool](https://biit.cs.ut.ee/gprofiler) as a token under the Custom GMT options.
For example, the same query in the web tool is available from [https://biit.cs.ut.ee/gplink/l/jh3HdbUWQZ](https://biit.cs.ut.ee/gplink/l/jh3HdbUWQZ).
----
......@@ -412,15 +423,10 @@ The result is a `data.frame` with columns:
* gene_names - corresponding gene names; can contain a `list` with multiple values
* variants - a `data.frame` with corresponding variant effects
----
## Generating short links to the web tool [g:Profiler](https://biit.cs.ut.ee/gprofiler) with **get_short_link()**
----
## Accessing archived versions or the beta release with **set_base_url()**
## Accessing archived versions or the beta release with `set_base_url`
You can change the underlying tool version to beta with:
```{r}
set_base_url("http://biit.cs.ut.ee/gprofiler_beta")
```
......@@ -436,7 +442,7 @@ Similarly, for the [archived versions](https://biit.cs.ut.ee/gprofiler/page/arch
set_base_url("http://biit.cs.ut.ee/gprofiler_archive3/e95_eg42_p13")
```
Note that gprofiler2 package is only compatible with versions *e94_eg41_p11* and higher.
Note that the gprofiler2 package is only compatible with versions *e94\_eg41\_p11* and higher.
----
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment