R Code Compilation - All Chapters
# Chapter 1 - Getting Started
print("Hello, R for Bioinformatics!")
5 + 5
my_first_variable <- 42
my_first_variable
?mean
values <- c(10, 15, 20, 25, 30)
mean_value <- mean(values)
print(paste("The mean value is:", mean_value))
plot(1:10, main="My First Plot", xlab="X axis", ylab="Y axis", type="o",
col="blue")
x <- 10; y <- 20; z = 30; Name <- "John"
# Chapter 2 - Data Structures
gene_lengths <- c(1500, 2300, 850, 4100)
print(gene_lengths)
name <- "jane"; print(Name); print(name)
sqrt(25); round(3.14159, digits=2)
count <- 42; message <- "Hello, Bioinformatics!"; is_active <- TRUE
print(count); print(message); print(is_active)
class(count); class(message); class(is_active)
count <- count + 8; print(count)
gene_count <- 100; proteinMass <- 45.6; sample_1 <- "ATCG"
weight <- 65.5; class(weight)
count <- 42L; class(count)
dna_sequence <- "ATGCAATAGCATCGAT"; class(dna_sequence)
is_significant <- TRUE; class(is_significant)
complex_number <- 3 + 4i; class(complex_number)
age_str <- "25"; age_num <- as.numeric(age_str)
print(age_num); class(age_num)
temp <- 37.5; temp_str <- as.character(temp)
print(temp_str); class(temp_str)
logical_vector <- c(TRUE, FALSE, TRUE)
numeric_vector <- as.numeric(logical_vector); print(numeric_vector)
measurements <- c(10.2, NA, 12.5, 14.1); print(measurements)
mean(measurements); mean(measurements, na.rm=TRUE)
empty_var <- NULL; print(empty_var); length(empty_var)
pos_inf <- 1/0; print(pos_inf)
neg_inf <- -1/0; print(neg_inf)
undefined <- 0/0; print(undefined)
# Arithmetic Operations
a <- 10; b <- 3
addition <- a + b; subtraction <- a - b; multiplication <- a * b
division <- a / b; exponentiation <- a ^ b; modulus <- a %% b
int_division <- a %/% b
print(paste("Addition:", addition))
print(paste("Subtraction:", subtraction))
print(paste("Multiplication:", multiplication))
print(paste("Division:", division))
print(paste("Exponentiation:", exponentiation))
print(paste("Modulus:", modulus))
print(paste("Integer Division:", int_division))
result <- 2 + 3 * 4; print(result)
result_with_parentheses <- (2 + 3) * 4; print(result_with_parentheses)
# Comparison Operations
x <- 5; y <- 10
equal <- x == y; not_equal <- x != y; greater_than <- x > y
less_than <- x < y; greater_equal <- x >= y; less_equal <- x <= y
print(paste("Equal:", equal))
print(paste("Not Equal:", not_equal))
print(paste("Greater Than:", greater_than))
print(paste("Less Than:", less_than))
print(paste("Greater Than or Equal:", greater_equal))
print(paste("Less Than or Equal:", less_equal))
num_value <- 5; char_value <- "5"
print(num_value == char_value)
value <- 10; print(value == NA); print(is.na(value))
# Logical Operations
a <- TRUE; b <- FALSE; and_result <- a & b
or_result <- a | b; not_result <- !a
print(paste("AND:", and_result))
print(paste("OR:", or_result))
print(paste("NOT:", not_result))
vec1 <- c(TRUE, FALSE, TRUE); vec2 <- c(FALSE, FALSE, TRUE)
and_vec <- vec1 & vec2; or_vec <- vec1 | vec2
print("Vectorized AND:"); print(and_vec)
print("Vectorized OR:"); print(or_vec)
first_elements_and <- vec1 && vec2; print(first_elements_and)
complex_condition <- (x > 0) & (y < 20); print(complex_condition)
# Mathematical Functions
num <- 16; sqrt_num <- sqrt(num); abs_neg <- abs(-10)
log_num <- log(10); log10_num <- log10(100); exp_num <- exp(1)
print(paste("Square root of", num, ":", sqrt_num))
print(paste("Absolute value of -10:", abs_neg))
print(paste("Natural log of 10:", log_num))
print(paste("Log base 10 of 100:", log10_num))
print(paste("e^1:", exp_num))
# Statistical Functions
data <- c(4, 7, 12, 5, 8, 15, 9)
mean_val <- mean(data); median_val <- median(data); sd_val <- sd(data)
min_val <- min(data); max_val <- max(data); sum_val <- sum(data)
print(paste("Mean:", mean_val))
print(paste("Median:", median_val))
print(paste("Standard Deviation:", sd_val))
print(paste("Minimum:", min_val))
print(paste("Maximum:", max_val))
print(paste("Sum:", sum_val))
# String Functions
first_name <- "john"; last_name <- "doe"
full_name <- paste(first_name, last_name); print(full_name)
full_name_comma <- paste(last_name, first_name, sep=", ");
print(full_name_comma)
upper_name <- toupper(full_name); lower_name <- tolower("HELLO WORLD")
print(upper_name); print(lower_name)
substr_result <- substr("ATGCAATAGCATCGAT", start=4, stop=8);
print(substr_result)
# Utility Functions
vec <- c(10, 20, NA, 40, 50)
vec_length <- length(vec); has_na <- is.na(vec); vec_class <- class(vec)
print(paste("Vector length:", vec_length))
print("NA positions:"); print(has_na)
print(paste("Vector class:", vec_class))
pi_value <- pi; print(pi_value)
rounded_pi <- round(pi_value); print(rounded_pi)
rounded_pi_2 <- round(pi_value, digits=2); print(rounded_pi_2)
mean_result1 <- mean(x=c(1, 2, 3, NA), na.rm=TRUE)
mean_result2 <- mean(na.rm=TRUE, x=c(1, 2, 3, NA))
print(mean_result1); print(mean_result2)
mean_result3 <- mean(c(1, 2, 3, NA), TRUE); print(mean_result3)
# Vectors
class(gene_lengths)
gene_names <- c("BRCA1", "TP53", "EGFR", "MYC"); print(gene_names);
class(gene_names)
is_significant <- c(TRUE, FALSE, TRUE, TRUE); print(is_significant);
class(is_significant)
exon_counts <- c(24L, 11L, 28L, 3L); print(exon_counts); class(exon_counts)
sequence_1_to_10 <- 1:10; print(sequence_1_to_10)
sequence_by_step <- seq(from=0, to=1, by=0.2); print(sequence_by_step)
sequence_fixed_length <- seq(from=1, to=100, length.out=5);
print(sequence_fixed_length)
repeated_values <- rep("A", times=5); print(repeated_values)
repeated_pattern <- rep(c(1, 2), times=3); print(repeated_pattern)
repeated_each <- rep(c("Control", "Treated"), each=3); print(repeated_each)
mixed_vector_1 <- c(1, "two", 3); print(mixed_vector_1); class(mixed_vector_1)
mixed_vector_2 <- c(10.5, TRUE, FALSE); print(mixed_vector_2);
class(mixed_vector_2)
mixed_vector_3 <- c(5L, TRUE, 10L, FALSE); print(mixed_vector_3);
class(mixed_vector_3)
# Vector Operations
vec1 <- c(1, 2, 3, 4); vec2 <- c(10, 20, 30, 40)
sum_vec <- vec1 + vec2; print(sum_vec)
product_vec <- vec1 * 5; print(product_vec)
comparison_vec <- vec1 > 2; print(comparison_vec)
sqrt_vec <- sqrt(vec1); print(sqrt_vec)
vec_short <- c(10, 20); vec_long <- c(1, 2, 3, 4, 5, 6)
sum_recycled <- vec_long + vec_short; print(sum_recycled)
# Vector Indexing
gene_expression <- c(15.2, 8.1, 22.5, 19.0, 7.5)
names(gene_expression) <- c("GeneA", "GeneB", "GeneC", "GeneD", "GeneE")
print(gene_expression)
first_element <- gene_expression[1]; print(first_element)
third_and_fifth <- gene_expression[c(3, 5)]; print(third_and_fifth)
all_except_second <- gene_expression[-2]; print(all_except_second)
all_except_first_and_fourth <- gene_expression[c(-1, -4)];
print(all_except_first_and_fourth)
low_expression_indices <- gene_expression < 10; print(low_expression_indices)
low_expression_values <- gene_expression[low_expression_indices];
print(low_expression_values)
high_expression_values <- gene_expression[gene_expression > 18];
print(high_expression_values)
gene_c_value <- gene_expression["GeneC"]; print(gene_c_value)
genes_b_and_d <- gene_expression[c("GeneB", "GeneD")]; print(genes_b_and_d)
gene_expression[1] <- 16.0; gene_expression["GeneE"] <- 8.0
gene_expression[gene_expression < 10] <- 10.0; print(gene_expression)
# Matrices
matrix_data <- 1:12
my_matrix <- matrix(data = matrix_data, nrow = 3, ncol = 4); print(my_matrix)
my_matrix_byrow <- matrix(data = matrix_data, nrow = 3, ncol = 4, byrow =
TRUE); print(my_matrix_byrow)
matrix_direct <- matrix(c(5, 10, 15, 20, 25, 30), nrow=2); print(matrix_direct)
rownames(my_matrix) <- c("Sample1", "Sample2", "Sample3")
colnames(my_matrix) <- c("GeneA", "GeneB", "GeneC", "GeneD"); print(my_matrix)
dim(my_matrix); nrow(my_matrix); ncol(my_matrix)
matrix1 <- matrix(1:6, nrow=2); matrix2 <- matrix(10:15, nrow=2)
print("Matrix 1:"); print(matrix1); print("Matrix 2:"); print(matrix2)
matrix_sum <- matrix1 + matrix2; print("Element-wise Sum:"); print(matrix_sum)
matrix_product_elementwise <- matrix1 * matrix2; print("Element-wise
Product:"); print(matrix_product_elementwise)
matrix_scaled <- matrix1 * 10; print("Scaled Matrix 1:"); print(matrix_scaled)
matrix_a <- matrix(1:6, nrow=2, ncol=3); matrix_b <- matrix(1:6, nrow=3,
ncol=2)
print("Matrix A (2x3):"); print(matrix_a); print("Matrix B (3x2):");
print(matrix_b)
matrix_product_matmul <- matrix_a %*% matrix_b; print("Matrix Multiplication (A
%*% B):"); print(matrix_product_matmul)
transposed_matrix_a <- t(matrix_a); print("Transposed Matrix A (3x2):");
print(transposed_matrix_a)
# Matrix Indexing
print(my_matrix)
element_2_3 <- my_matrix[2, 3]; print(element_2_3)
row_2 <- my_matrix[2, ]; print(row_2); class(row_2)
col_3 <- my_matrix[, 3]; print(col_3); class(col_3)
row_2_matrix <- my_matrix[2, , drop=FALSE]; print(row_2_matrix);
class(row_2_matrix)
col_3_matrix <- my_matrix[, 3, drop=FALSE]; print(col_3_matrix);
class(col_3_matrix)
sub_matrix <- my_matrix[c(1, 3), c(2, 4)]; print(sub_matrix)
sample1_gene_b <- my_matrix["Sample1", "GeneB"]; print(sample1_gene_b)
sample2_data <- my_matrix["Sample2", ]; print(sample2_data)
matrix_subset_logical <- my_matrix[my_matrix[, "GeneA"] > 1, ];
print(matrix_subset_logical)
my_matrix[1, 1] <- 100; my_matrix["Sample3", "GeneC"] <- 999; my_matrix[,
"GeneD"] <- 0; print(my_matrix)
# Lists
patient_info <- list(
patient_id = "P123", age = 45L, diagnosis = "Type 2 Diabetes", is_smoker =
TRUE,
lab_values = c(glucose = 150, hba1c = 7.5, cholesterol = 210),
medications = c("Metformin", "Lisinopril")
)
print(patient_info); str(patient_info)
complex_list <- list(
info = "Experiment Details", samples = c("S1", "S2", "S3"),
expression_matrix = matrix(rnorm(9), nrow=3, dimnames=list(paste0("G", 1:3),
paste0("S", 1:3))),
metadata = patient_info
)
print(complex_list); str(complex_list)
# List Indexing
print(patient_info)
patient_id_value <- patient_info[[1]]; print(patient_id_value);
class(patient_id_value)
diagnosis_value <- patient_info[["diagnosis"]]; print(diagnosis_value);
class(diagnosis_value)
age_value <- patient_info$age; print(age_value); class(age_value)
lab_values_vector <- patient_info$lab_values; print(lab_values_vector);
class(lab_values_vector)
first_two_elements_list <- patient_info[1:2]; print(first_two_elements_list);
class(first_two_elements_list)
medications_list <- patient_info["medications"]; print(medications_list);
class(medications_list)
first_medication <- patient_info$medications[1]; print(first_medication)
glucose_value <- patient_info$lab_values["glucose"]; print(glucose_value)
print(complex_list$`metadata`$patient_id); print(complex_list[["metadata"]]
[["age"]])
patient_info$age <- 46L; patient_info[["is_smoker"]] <- FALSE
patient_info$`medications <- c(patient_info`$medications, "Aspirin")
patient_info$new_field <- "Some new information"; print(patient_info)
patient_info$new_field <- NULL; print(patient_info)
# Data Frames
gene_ids <- c("Gene1", "Gene2", "Gene3", "Gene4")
chromosome <- c("chr1", "chr2", "chr1", "chrX")
expression_level <- c(5.2, 1.8, 7.1, 4.5)
is_expressed <- c(TRUE, TRUE, TRUE, TRUE)
gene_data_df <- data.frame(GeneID = gene_ids, Chromosome = chromosome,
Expression = expression_level, Expressed = is_expressed)
print(gene_data_df); class(gene_data_df); str(gene_data_df)
gene_data_df_no_factors <- data.frame(GeneID = gene_ids, Chromosome =
chromosome, Expression = expression_level, Expressed = is_expressed,
stringsAsFactors = FALSE)
str(gene_data_df_no_factors)
# Tibbles
library(tibble)
gene_data_tbl <- tibble(GeneID = gene_ids, Chromosome = chromosome, Expression
= expression_level, Expressed = is_expressed)
print(gene_data_tbl); class(gene_data_tbl); str(gene_data_tbl)
element_2_3_df <- gene_data_tbl[2, 3]; print(element_2_3_df)
row_1_df <- gene_data_tbl[1, ]; print(row_1_df); class(row_1_df)
expression_col_matrix_style <- gene_data_tbl[, "Expression"];
print(expression_col_matrix_style); class(expression_col_matrix_style)
expression_col_list_style <- gene_data_tbl$Expression;
print(expression_col_list_style); class(expression_col_list_style)
chromosome_col_list_style <- gene_data_tbl[["Chromosome"]];
print(chromosome_col_list_style); class(chromosome_col_list_style)
expressed_high <- gene_data_tbl[gene_data_tbl$Expression > 5, ];
print(expressed_high)
chr1_genes <- gene_data_tbl[gene_data_tbl$Chromosome == "chr1", ];
print(chr1_genes)
gene_expression_subset <- gene_data_tbl[, c("GeneID", "Expression")];
print(gene_expression_subset)
gene_expression_subset_list <- gene_data_tbl[c("GeneID", "Expression")];
print(gene_expression_subset_list)
gene_data_tbl$`Expression[1] <- 6.0; gene_data_tbl`$LogExpression <-
log2(gene_data_tbl$Expression); print(gene_data_tbl)
gene_data_tbl$Expressed <- NULL; print(gene_data_tbl)
# Chapter 3 - Programming Fundamentals
gc_content <- 0.65
if (gc_content > 0.6) { print("High GC content detected.") }
expression_level <- 4.5
if (expression_level > 5.0) { print("Gene is highly expressed.") } else {
print("Gene expression is not high.") }
sequence_length <- 1500
if (sequence_length < 100) { print("Short sequence.") } else if
(sequence_length < 1000) { print("Medium sequence.") } else { print("Long
sequence.") }
expression_values <- c(2.1, 8.5, 4.9, 12.3, 1.5); threshold <- 5.0
classification <- ifelse(expression_values > threshold, "High", "Low");
print(classification)
p_values <- c(0.01, 0.25, 0.04, 0.001, 0.5); significance_level <- 0.05
# dplyr Operations
library(dplyr)
data(iris); iris_tbl <- as_tibble(iris); print(iris_tbl)
setosa_data <- filter(iris_tbl, Species == "setosa"); print(setosa_data)
versicolor_long_sepal <- filter(iris_tbl, Species == "versicolor", Sepal.Length
> 6); print(versicolor_long_sepal)
sorted_by_petal_length <- arrange(iris_tbl, desc(Petal.Length));
print(head(sorted_by_petal_length))
sorted_multi <- arrange(iris_tbl, Species, Sepal.Width);
print(head(sorted_multi))
sepal_species_data <- select(iris_tbl, Species, Sepal.Length);
print(head(sepal_species_data))
no_species_data <- select(iris_tbl, -Species); print(head(no_species_data))
petal_data <- select(iris_tbl, starts_with("Petal")); print(head(petal_data))
iris_with_area <- mutate(iris_tbl, Petal.Area = Petal.Length * Petal.Width)
print(head(select(iris_with_area, Petal.Length, Petal.Width, Petal.Area)))
iris_transformed <- mutate(iris_tbl, Sepal.Length.cm = Sepal.Length / 10,
Petal.Length.cm = Petal.Length / 10)
print(head(iris_transformed))
summary_petal_length <- summarise(iris_tbl, avg_petal_length =
mean(Petal.Length)); print(summary_petal_length)
summary_stats <- summarise(iris_tbl, avg_sepal_width = mean(Sepal.Width),
max_petal_length = max(Petal.Length), n_observations = n())
print(summary_stats)
species_summary <- group_by(iris_tbl, Species); print(species_summary)
species_avg_stats <- summarise(species_summary, avg_sepal_length =
mean(Sepal.Length), avg_sepal_width = mean(Sepal.Width), count = n())
print(species_avg_stats)
species_avg_stats_ungrouped <- ungroup(species_avg_stats);
print(species_avg_stats_ungrouped)
result_nested <- summarise(filter(mutate(iris_tbl, Petal.Area = Petal.Length *
Petal.Width), Species == "virginica", Sepal.Length > 7), avg_petal_area =
mean(Petal.Area))
print(result_nested)
# Chapter 4 - Bioinformatics
BiocManager::install(c("Biostrings", "GenomicRanges", "rtracklayer",
"SummarizedExperiment"))
library(Biostrings)
dna_seq <- DNAString("ATGCAATAGCATCGAT"); print(dna_seq); length(dna_seq)
complement_seq <- complement(dna_seq); print(complement_seq)
reverse_complement_seq <- reverseComplement(dna_seq);
print(reverse_complement_seq)
gc_content_seq <- letterFrequency(dna_seq, "GC", as.prob=TRUE);
print(gc_content_seq)
dna_set <- DNAStringSet(c("ATGCAATAGCATCGAT", "GCTAGCTAGCTA", "AAATTTCCCGGG"))
names(dna_set) <- c("Seq1", "Seq2", "Seq3"); print(dna_set)
writeXStringSet(dna_set, "example_sequences.fasta")
loaded_sequences <- readDNAStringSet("example_sequences.fasta");
print(loaded_sequences)
# GenomicRanges
library(GenomicRanges)
gr <- GRanges(
seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
ranges = IRanges(start=c(101, 201, 401, 501, 601, 701, 801, 901, 1001, 1101),
end=c(110, 250, 450, 550, 650, 750, 850, 950, 1050, 1150)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10, GC = seq(0.4, 0.58, length.out=10)
)
print(gr); seqnames(gr); ranges(gr); start(gr); end(gr); width(gr); strand(gr)
mcols(gr); gr$`score; gr`$GC; length(gr)
first_three_ranges <- gr[1:3]; print(first_three_ranges)
chr1_ranges <- gr[seqnames(gr) == "chr1"]; print(chr1_ranges)
high_score_ranges <- gr[gr$score > 7]; print(high_score_ranges)
shifted_gr <- shift(gr, shift = 1000); print(shifted_gr)
resized_gr_start <- resize(gr, width = 20, fix = "start");
print(resized_gr_start)
resized_gr_end <- resize(gr, width = 100, fix = "end"); print(resized_gr_end)
gr_query <- GRanges("chr1", IRanges(c(120, 360), c(180, 410)), score=c(50, 80))
print("Query Ranges:"); print(gr_query); print("Subject Ranges (gr):");
print(gr)
overlaps <- findOverlaps(gr_query, gr); print("Overlap results (indices):");
print(overlaps)
query_hits <- gr_query[queryHits(overlaps)]; subject_hits <-
gr[subjectHits(overlaps)]
print("Query ranges that overlap:"); print(query_hits); print("Subject ranges
that overlap:"); print(subject_hits)
subset_by_overlap <- subsetByOverlaps(gr, gr_query); print("Ranges in gr that
overlap with gr_query:"); print(subset_by_overlap)
nearest_indices <- nearest(gr_query, gr); print("Index of nearest range in gr
for each range in gr_query:"); print(nearest_indices);
print(gr[nearest_indices])
distances <- distanceToNearest(gr_query, gr); print("Distances to nearest
ranges:"); print(distances)
gr_a <- GRanges("chr1", IRanges(c(100, 200), c(150, 250))); gr_b <-
GRanges("chr1", IRanges(c(130, 260), c(180, 300)))
union_gr <- union(gr_a, gr_b); intersect_gr <- intersect(gr_a, gr_b);
setdiff_gr <- setdiff(gr_a, gr_b)
print("Union:"); print(union_gr); print("Intersect:"); print(intersect_gr);
print("Set Difference (A-B):"); print(setdiff_gr)
coverage_chr1 <- coverage(gr[seqnames(gr) == "chr1"]); print(coverage_chr1)
# rtracklayer
library(rtracklayer)
bed_content <- c("chr1\t100\t150\tGeneA\t10\t+", "chr1\t300\t380\tGeneB\t25\t-
", "chr2\t200\t250\tGeneC\t15\t+")
writeLines(bed_content, "example.bed"); bed_gr <- import("example.bed", format
= "bed"); print(bed_gr)
gtf_content <- c("chr1\tHAVANA\texon\t11869\t12227\t.\t+\t.\tgene_id
\"ENSG001\"; transcript_id \"ENST001\"; exon_number 1;",
"chr1\tHAVANA\texon\t12613\t12721\t.\t+\t.\tgene_id \"ENSG001\"; transcript_id
\"ENST001\"; exon_number 2;",
"chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tgene_id \"ENSG001\"; gene_name
\"DDX11L1\";", "chr2\tHAVANA\texon\t5000\t5500\t.\t-\t.\tgene_id \"ENSG002\";
transcript_id \"ENST002\"; exon_number 1;")
writeLines(gtf_content, "example.gtf"); gtf_gr <- import("example.gtf", format
= "gtf"); print(gtf_gr); print(mcols(gtf_gr))
exons_gr <- gtf_gr[gtf_gr$type == "exon"]; print(exons_gr)
exons_gr$`score <- sample(1:100, length(exons_gr), replace=TRUE);
exons_gr`$name <- paste0("Exon_", 1:length(exons_gr))
export(exons_gr, "exported_exons.bed", format = "bed")
# SummarizedExperiment
library(SummarizedExperiment)
counts_matrix <- matrix(rpois(100, lambda = 10), nrow = 20, ncol = 5, dimnames
= list(paste0("Gene", 1:20), paste0("Sample", 1:5)))
print(head(counts_matrix))
feature_data <- GRanges(seqnames = sample(c("chr1", "chr2"), 20, replace =
TRUE), ranges = IRanges(start = sample(1:10000, 20), width = sample(50:500,
20)), strand = sample(c("+", "-"), 20, replace = TRUE), gene_id =
paste0("Gene", 1:20), gc_content = rnorm(20, 0.5, 0.05))
names(feature_data) <- rownames(counts_matrix); print(feature_data)
sample_data <- DataFrame(Treatment = rep(c("Control", "Treated"), length.out =
5), Replicate = c(1, 1, 2, 2, 3), row.names = colnames(counts_matrix))
print(sample_data)
se <- SummarizedExperiment(assays = list(counts = counts_matrix), rowData =
feature_data, colData = sample_data); print(se)
assay(se); assays(se); rowData(se); colData(se); dim(se); nrow(se); ncol(se)
se_subset_genes <- se[1:5, ]; print(se_subset_genes)
se_subset_samples <- se[, se$Treatment == "Treated"]; print(se_subset_samples)
se_subset_both <- se[rowData(se)$`gc_content > 0.55, se`$Replicate == 1];
print(se_subset_both)
normalized_counts <- log2(counts_matrix + 1); assay(se, "logcounts") <-
normalized_counts; print(assays(se))
# Chapter 5 - Data Visualization
library(ggplot2); library(dplyr)
data(iris); iris_tbl <- as_tibble(iris)
base_plot <- ggplot(data = iris_tbl, mapping = aes(x = Sepal.Length, y =
Sepal.Width, color = Species))
scatter_plot <- base_plot + geom_point(); print(scatter_plot)
ggplot(data = iris_tbl, aes(x = Sepal.Length, y = Sepal.Width, color =
Species)) + geom_point()
set.seed(123); num_proteins <- 1000
proteomics_data <- tibble(ProteinID = paste0("Prot", 1:num_proteins),
Control_Abundance = rnorm(num_proteins, mean = 10, sd = 2), Treated_Abundance =
Control_Abundance + rnorm(num_proteins, mean = 0.5, sd = 1.5))
proteomics_data$`Treated_Abundance[1:50] <-
proteomics_data`$Treated_Abundance[1:50] + rnorm(50, mean = 3, sd = 1)
proteomics_data$`Treated_Abundance[51:100] <-
proteomics_data`$Treated_Abundance[51:100] - rnorm(50, mean = 3, sd = 1)
scatter_proteomics <- ggplot(proteomics_data, aes(x = Control_Abundance, y =
Treated_Abundance)) + geom_point(alpha = 0.5) + labs(title = "Protein
Abundance: Treated vs. Control", x = "Log2 Abundance (Control)", y = "Log2
Abundance (Treated)") + theme_minimal()
print(scatter_proteomics)
scatter_proteomics_diag <- scatter_proteomics + geom_abline(intercept = 0,
slope = 1, color = "red", linetype = "dashed"); print(scatter_proteomics_diag)
box_plot_iris <- ggplot(iris_tbl, aes(x = Species, y = Sepal.Width, fill =
Species)) + geom_boxplot() + labs(title = "Sepal Width Distribution by
Species", x = "Species", y = "Sepal Width") + theme_light()
print(box_plot_iris)
violin_plot_iris <- ggplot(iris_tbl, aes(x = Species, y = Petal.Length, fill =
Species)) + geom_violin(trim = FALSE, alpha = 0.7) + geom_jitter(width = 0.1,
height = 0, alpha = 0.5, size = 1.5) + labs(title = "Petal Length Distribution
by Species", x = "Species", y = "Petal Length") + theme_bw()
print(violin_plot_iris)
histogram_petal <- ggplot(iris_tbl, aes(x = Petal.Length)) +
geom_histogram(binwidth = 0.2, fill = "skyblue", color = "black") + labs(title
= "Histogram of Petal Lengths", x = "Petal Length", y = "Frequency")
print(histogram_petal)
density_petal_species <- ggplot(iris_tbl, aes(x = Petal.Length, fill = Species,
color = Species)) + geom_density(alpha = 0.5) + labs(title = "Density of Petal
Lengths by Species", x = "Petal Length", y = "Density")
print(density_petal_species)
bar_plot_counts <- ggplot(iris_tbl, aes(x = Species, fill = Species)) +
geom_bar() + labs(title = "Count of Observations per Species", x = "Species", y
= "Count")
print(bar_plot_counts)
iris_summary <- iris_tbl %>% group_by(Species) %>% summarise(Avg.Sepal.Length =
mean(Sepal.Length)); print(iris_summary)
bar_plot_means <- ggplot(iris_summary, aes(x = Species, y = Avg.Sepal.Length,
fill = Species)) + geom_col() + labs(title = "Average Sepal Length per
Species", x = "Species", y = "Average Sepal Length")
print(bar_plot_means)
set.seed(456)
heatmap_data_matrix <- matrix(rnorm(50 * 10, mean = 0, sd = 1.5), nrow = 50,
ncol = 10, dimnames = list(paste0("Prot", 1:50), paste0("Sample", 1:10)))
heatmap_data_matrix[, 1:5] <- heatmap_data_matrix[, 1:5] + 1;
heatmap_data_matrix[1:20, ] <- heatmap_data_matrix[1:20, ] + 1.5
library(tidyr); library(tibble)
heatmap_data_long <- as.data.frame(heatmap_data_matrix) %>%
rownames_to_column(var = "ProteinID") %>% pivot_longer(cols = -ProteinID,
names_to = "SampleID", values_to = "Log2FoldChange")
heatmap_plot <- ggplot(heatmap_data_long, aes(x = SampleID, y = ProteinID, fill
= Log2FoldChange)) + geom_tile(color = "white", size = 0.1) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
labs(title = "Protein Expression Heatmap", x = "Sample", y = "Protein", fill =
"Log2 FC") + theme_minimal() + theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8), legend.position =
"bottom")
print(heatmap_plot)
set.seed(789); num_proteins_volcano <- 2000
volcano_data <- tibble(ProteinID = paste0("Prot", 1:num_proteins_volcano),
log2FoldChange = rnorm(num_proteins_volcano, mean = 0, sd = 1.5), p.value =
runif(num_proteins_volcano, min = 0, max = 1))
up_indices <- 1:100; volcano_data$`log2FoldChange[up_indices] <-
rnorm(length(up_indices), mean = 3, sd = 1); volcano_data`$p.value[up_indices]
<- rbeta(length(up_indices), 1, 50)
down_indices <- 101:200; volcano_data$`log2FoldChange[down_indices] <-
rnorm(length(down_indices), mean = -3, sd = 1);
volcano_data`$p.value[down_indices] <- rbeta(length(down_indices), 1, 50)
volcano_data <- volcano_data %>% mutate(negLog10p = -log10(p.value),
Significance = case_when(log2FoldChange > 1.5 & p.value < 0.01 ~ "Upregulated",
log2FoldChange < -1.5 & p.value < 0.01 ~ "Downregulated", TRUE ~ "Not
Significant"), Significance = factor(Significance, levels = c("Upregulated",
"Downregulated", "Not Significant")))
volcano_plot <- ggplot(volcano_data, aes(x = log2FoldChange, y = negLog10p,
color = Significance)) + geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue",
"Not Significant" = "grey")) + labs(title = "Volcano Plot of Differential
Protein Abundance", x = "Log2 Fold Change", y = "-Log10 P-value") +
theme_minimal() + theme(legend.position = "bottom") + geom_hline(yintercept = -
log10(0.01), linetype = "dashed", color = "darkgrey") + geom_vline(xintercept =
c(-1.5, 1.5), linetype = "dashed", color = "darkgrey")
# Chapter 6 - Shiny
library(shiny); library(ggplot2)
ui <- fluidPage(titlePanel("Old Faithful Geyser Data Histogram"),
sidebarLayout(sidebarPanel(sliderInput(inputId = "bins", label = "Number of
bins:", min = 1, max = 50, value = 30)), mainPanel(plotOutput(outputId =
"distPlot"))))
server <- function(input, output) { output$`distPlot <- renderPlot({ x <-
faithful[, 2]; bins <- seq(min(x), max(x), length.out = input`$bins + 1);
hist(x, breaks = bins, col = "darkgray", border = "white", xlab = "Waiting time
to next eruption (in mins)", main = "Histogram of waiting times") }) }
shinyApp(ui = ui, server = server)
# Chapter 7 - Performance
slow_function <- function(n) { result <- 0; for (i in 1:n) { result <- result +
log(sqrt(i^2.5)) }; Sys.sleep(0.1); return(result) }
execution_time <- system.time({ result_single <- slow_function(10000) });
print(execution_time)
num_reps <- 10
system.time({ results_sequential <- vector("list", num_reps); for(i in
1:num_reps) { results_sequential[[i]] <- slow_function(5000) } })