0% found this document useful (0 votes)
3 views10 pages

R Code Compact

The document provides a comprehensive overview of R programming for bioinformatics, covering various topics such as data structures, arithmetic operations, logical operations, statistical functions, string functions, and data frames. It includes practical examples and code snippets for each topic, demonstrating how to manipulate data, perform calculations, and visualize results. Additionally, it introduces advanced data structures like lists and tibbles, and explains their usage in data analysis.
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
3 views10 pages

R Code Compact

The document provides a comprehensive overview of R programming for bioinformatics, covering various topics such as data structures, arithmetic operations, logical operations, statistical functions, string functions, and data frames. It includes practical examples and code snippets for each topic, demonstrating how to manipulate data, perform calculations, and visualize results. Additionally, it introduces advanced data structures like lists and tibbles, and explains their usage in data analysis.
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 10

R Code Compilation - All Chapters

# Chapter 1 - Getting Started


print("Hello, R for Bioinformatics!")
5 + 5
my_first_variable <- 42
my_first_variable
?mean
values <- c(10, 15, 20, 25, 30)
mean_value <- mean(values)
print(paste("The mean value is:", mean_value))
plot(1:10, main="My First Plot", xlab="X axis", ylab="Y axis", type="o",
col="blue")
x <- 10; y <- 20; z = 30; Name <- "John"

# Chapter 2 - Data Structures


gene_lengths <- c(1500, 2300, 850, 4100)
print(gene_lengths)
name <- "jane"; print(Name); print(name)
sqrt(25); round(3.14159, digits=2)
count <- 42; message <- "Hello, Bioinformatics!"; is_active <- TRUE
print(count); print(message); print(is_active)
class(count); class(message); class(is_active)
count <- count + 8; print(count)
gene_count <- 100; proteinMass <- 45.6; sample_1 <- "ATCG"
weight <- 65.5; class(weight)
count <- 42L; class(count)
dna_sequence <- "ATGCAATAGCATCGAT"; class(dna_sequence)
is_significant <- TRUE; class(is_significant)
complex_number <- 3 + 4i; class(complex_number)
age_str <- "25"; age_num <- as.numeric(age_str)
print(age_num); class(age_num)
temp <- 37.5; temp_str <- as.character(temp)
print(temp_str); class(temp_str)
logical_vector <- c(TRUE, FALSE, TRUE)
numeric_vector <- as.numeric(logical_vector); print(numeric_vector)
measurements <- c(10.2, NA, 12.5, 14.1); print(measurements)
mean(measurements); mean(measurements, na.rm=TRUE)
empty_var <- NULL; print(empty_var); length(empty_var)
pos_inf <- 1/0; print(pos_inf)
neg_inf <- -1/0; print(neg_inf)
undefined <- 0/0; print(undefined)

# Arithmetic Operations
a <- 10; b <- 3
addition <- a + b; subtraction <- a - b; multiplication <- a * b
division <- a / b; exponentiation <- a ^ b; modulus <- a %% b
int_division <- a %/% b
print(paste("Addition:", addition))
print(paste("Subtraction:", subtraction))
print(paste("Multiplication:", multiplication))
print(paste("Division:", division))
print(paste("Exponentiation:", exponentiation))
print(paste("Modulus:", modulus))
print(paste("Integer Division:", int_division))
result <- 2 + 3 * 4; print(result)
result_with_parentheses <- (2 + 3) * 4; print(result_with_parentheses)

# Comparison Operations
x <- 5; y <- 10
equal <- x == y; not_equal <- x != y; greater_than <- x > y
less_than <- x < y; greater_equal <- x >= y; less_equal <- x <= y
print(paste("Equal:", equal))
print(paste("Not Equal:", not_equal))
print(paste("Greater Than:", greater_than))
print(paste("Less Than:", less_than))
print(paste("Greater Than or Equal:", greater_equal))
print(paste("Less Than or Equal:", less_equal))
num_value <- 5; char_value <- "5"
print(num_value == char_value)
value <- 10; print(value == NA); print(is.na(value))

# Logical Operations
a <- TRUE; b <- FALSE; and_result <- a & b
or_result <- a | b; not_result <- !a
print(paste("AND:", and_result))
print(paste("OR:", or_result))
print(paste("NOT:", not_result))
vec1 <- c(TRUE, FALSE, TRUE); vec2 <- c(FALSE, FALSE, TRUE)
and_vec <- vec1 & vec2; or_vec <- vec1 | vec2
print("Vectorized AND:"); print(and_vec)
print("Vectorized OR:"); print(or_vec)
first_elements_and <- vec1 && vec2; print(first_elements_and)
complex_condition <- (x > 0) & (y < 20); print(complex_condition)

# Mathematical Functions
num <- 16; sqrt_num <- sqrt(num); abs_neg <- abs(-10)
log_num <- log(10); log10_num <- log10(100); exp_num <- exp(1)
print(paste("Square root of", num, ":", sqrt_num))
print(paste("Absolute value of -10:", abs_neg))
print(paste("Natural log of 10:", log_num))
print(paste("Log base 10 of 100:", log10_num))
print(paste("e^1:", exp_num))

# Statistical Functions
data <- c(4, 7, 12, 5, 8, 15, 9)
mean_val <- mean(data); median_val <- median(data); sd_val <- sd(data)
min_val <- min(data); max_val <- max(data); sum_val <- sum(data)
print(paste("Mean:", mean_val))
print(paste("Median:", median_val))
print(paste("Standard Deviation:", sd_val))
print(paste("Minimum:", min_val))
print(paste("Maximum:", max_val))
print(paste("Sum:", sum_val))

# String Functions
first_name <- "john"; last_name <- "doe"
full_name <- paste(first_name, last_name); print(full_name)
full_name_comma <- paste(last_name, first_name, sep=", ");
print(full_name_comma)
upper_name <- toupper(full_name); lower_name <- tolower("HELLO WORLD")
print(upper_name); print(lower_name)
substr_result <- substr("ATGCAATAGCATCGAT", start=4, stop=8);
print(substr_result)

# Utility Functions
vec <- c(10, 20, NA, 40, 50)
vec_length <- length(vec); has_na <- is.na(vec); vec_class <- class(vec)
print(paste("Vector length:", vec_length))
print("NA positions:"); print(has_na)
print(paste("Vector class:", vec_class))
pi_value <- pi; print(pi_value)
rounded_pi <- round(pi_value); print(rounded_pi)
rounded_pi_2 <- round(pi_value, digits=2); print(rounded_pi_2)
mean_result1 <- mean(x=c(1, 2, 3, NA), na.rm=TRUE)
mean_result2 <- mean(na.rm=TRUE, x=c(1, 2, 3, NA))
print(mean_result1); print(mean_result2)
mean_result3 <- mean(c(1, 2, 3, NA), TRUE); print(mean_result3)

# Vectors
class(gene_lengths)
gene_names <- c("BRCA1", "TP53", "EGFR", "MYC"); print(gene_names);
class(gene_names)
is_significant <- c(TRUE, FALSE, TRUE, TRUE); print(is_significant);
class(is_significant)
exon_counts <- c(24L, 11L, 28L, 3L); print(exon_counts); class(exon_counts)
sequence_1_to_10 <- 1:10; print(sequence_1_to_10)
sequence_by_step <- seq(from=0, to=1, by=0.2); print(sequence_by_step)
sequence_fixed_length <- seq(from=1, to=100, length.out=5);
print(sequence_fixed_length)
repeated_values <- rep("A", times=5); print(repeated_values)
repeated_pattern <- rep(c(1, 2), times=3); print(repeated_pattern)
repeated_each <- rep(c("Control", "Treated"), each=3); print(repeated_each)
mixed_vector_1 <- c(1, "two", 3); print(mixed_vector_1); class(mixed_vector_1)
mixed_vector_2 <- c(10.5, TRUE, FALSE); print(mixed_vector_2);
class(mixed_vector_2)
mixed_vector_3 <- c(5L, TRUE, 10L, FALSE); print(mixed_vector_3);
class(mixed_vector_3)

# Vector Operations
vec1 <- c(1, 2, 3, 4); vec2 <- c(10, 20, 30, 40)
sum_vec <- vec1 + vec2; print(sum_vec)
product_vec <- vec1 * 5; print(product_vec)
comparison_vec <- vec1 > 2; print(comparison_vec)
sqrt_vec <- sqrt(vec1); print(sqrt_vec)
vec_short <- c(10, 20); vec_long <- c(1, 2, 3, 4, 5, 6)
sum_recycled <- vec_long + vec_short; print(sum_recycled)

# Vector Indexing
gene_expression <- c(15.2, 8.1, 22.5, 19.0, 7.5)
names(gene_expression) <- c("GeneA", "GeneB", "GeneC", "GeneD", "GeneE")
print(gene_expression)
first_element <- gene_expression[1]; print(first_element)
third_and_fifth <- gene_expression[c(3, 5)]; print(third_and_fifth)
all_except_second <- gene_expression[-2]; print(all_except_second)
all_except_first_and_fourth <- gene_expression[c(-1, -4)];
print(all_except_first_and_fourth)
low_expression_indices <- gene_expression < 10; print(low_expression_indices)
low_expression_values <- gene_expression[low_expression_indices];
print(low_expression_values)
high_expression_values <- gene_expression[gene_expression > 18];
print(high_expression_values)
gene_c_value <- gene_expression["GeneC"]; print(gene_c_value)
genes_b_and_d <- gene_expression[c("GeneB", "GeneD")]; print(genes_b_and_d)
gene_expression[1] <- 16.0; gene_expression["GeneE"] <- 8.0
gene_expression[gene_expression < 10] <- 10.0; print(gene_expression)

# Matrices
matrix_data <- 1:12
my_matrix <- matrix(data = matrix_data, nrow = 3, ncol = 4); print(my_matrix)
my_matrix_byrow <- matrix(data = matrix_data, nrow = 3, ncol = 4, byrow =
TRUE); print(my_matrix_byrow)
matrix_direct <- matrix(c(5, 10, 15, 20, 25, 30), nrow=2); print(matrix_direct)
rownames(my_matrix) <- c("Sample1", "Sample2", "Sample3")
colnames(my_matrix) <- c("GeneA", "GeneB", "GeneC", "GeneD"); print(my_matrix)
dim(my_matrix); nrow(my_matrix); ncol(my_matrix)
matrix1 <- matrix(1:6, nrow=2); matrix2 <- matrix(10:15, nrow=2)
print("Matrix 1:"); print(matrix1); print("Matrix 2:"); print(matrix2)
matrix_sum <- matrix1 + matrix2; print("Element-wise Sum:"); print(matrix_sum)
matrix_product_elementwise <- matrix1 * matrix2; print("Element-wise
Product:"); print(matrix_product_elementwise)
matrix_scaled <- matrix1 * 10; print("Scaled Matrix 1:"); print(matrix_scaled)
matrix_a <- matrix(1:6, nrow=2, ncol=3); matrix_b <- matrix(1:6, nrow=3,
ncol=2)
print("Matrix A (2x3):"); print(matrix_a); print("Matrix B (3x2):");
print(matrix_b)
matrix_product_matmul <- matrix_a %*% matrix_b; print("Matrix Multiplication (A
%*% B):"); print(matrix_product_matmul)
transposed_matrix_a <- t(matrix_a); print("Transposed Matrix A (3x2):");
print(transposed_matrix_a)

# Matrix Indexing
print(my_matrix)
element_2_3 <- my_matrix[2, 3]; print(element_2_3)
row_2 <- my_matrix[2, ]; print(row_2); class(row_2)
col_3 <- my_matrix[, 3]; print(col_3); class(col_3)
row_2_matrix <- my_matrix[2, , drop=FALSE]; print(row_2_matrix);
class(row_2_matrix)
col_3_matrix <- my_matrix[, 3, drop=FALSE]; print(col_3_matrix);
class(col_3_matrix)
sub_matrix <- my_matrix[c(1, 3), c(2, 4)]; print(sub_matrix)
sample1_gene_b <- my_matrix["Sample1", "GeneB"]; print(sample1_gene_b)
sample2_data <- my_matrix["Sample2", ]; print(sample2_data)
matrix_subset_logical <- my_matrix[my_matrix[, "GeneA"] > 1, ];
print(matrix_subset_logical)
my_matrix[1, 1] <- 100; my_matrix["Sample3", "GeneC"] <- 999; my_matrix[,
"GeneD"] <- 0; print(my_matrix)

# Lists
patient_info <- list(
patient_id = "P123", age = 45L, diagnosis = "Type 2 Diabetes", is_smoker =
TRUE,
lab_values = c(glucose = 150, hba1c = 7.5, cholesterol = 210),
medications = c("Metformin", "Lisinopril")
)
print(patient_info); str(patient_info)
complex_list <- list(
info = "Experiment Details", samples = c("S1", "S2", "S3"),
expression_matrix = matrix(rnorm(9), nrow=3, dimnames=list(paste0("G", 1:3),
paste0("S", 1:3))),
metadata = patient_info
)
print(complex_list); str(complex_list)

# List Indexing
print(patient_info)
patient_id_value <- patient_info[[1]]; print(patient_id_value);
class(patient_id_value)
diagnosis_value <- patient_info[["diagnosis"]]; print(diagnosis_value);
class(diagnosis_value)
age_value <- patient_info$age; print(age_value); class(age_value)
lab_values_vector <- patient_info$lab_values; print(lab_values_vector);
class(lab_values_vector)
first_two_elements_list <- patient_info[1:2]; print(first_two_elements_list);
class(first_two_elements_list)
medications_list <- patient_info["medications"]; print(medications_list);
class(medications_list)
first_medication <- patient_info$medications[1]; print(first_medication)
glucose_value <- patient_info$lab_values["glucose"]; print(glucose_value)
print(complex_list$`metadata`$patient_id); print(complex_list[["metadata"]]
[["age"]])
patient_info$age <- 46L; patient_info[["is_smoker"]] <- FALSE
patient_info$`medications <- c(patient_info`$medications, "Aspirin")
patient_info$new_field <- "Some new information"; print(patient_info)
patient_info$new_field <- NULL; print(patient_info)

# Data Frames
gene_ids <- c("Gene1", "Gene2", "Gene3", "Gene4")
chromosome <- c("chr1", "chr2", "chr1", "chrX")
expression_level <- c(5.2, 1.8, 7.1, 4.5)
is_expressed <- c(TRUE, TRUE, TRUE, TRUE)
gene_data_df <- data.frame(GeneID = gene_ids, Chromosome = chromosome,
Expression = expression_level, Expressed = is_expressed)
print(gene_data_df); class(gene_data_df); str(gene_data_df)
gene_data_df_no_factors <- data.frame(GeneID = gene_ids, Chromosome =
chromosome, Expression = expression_level, Expressed = is_expressed,
stringsAsFactors = FALSE)
str(gene_data_df_no_factors)

# Tibbles
library(tibble)
gene_data_tbl <- tibble(GeneID = gene_ids, Chromosome = chromosome, Expression
= expression_level, Expressed = is_expressed)
print(gene_data_tbl); class(gene_data_tbl); str(gene_data_tbl)
element_2_3_df <- gene_data_tbl[2, 3]; print(element_2_3_df)
row_1_df <- gene_data_tbl[1, ]; print(row_1_df); class(row_1_df)
expression_col_matrix_style <- gene_data_tbl[, "Expression"];
print(expression_col_matrix_style); class(expression_col_matrix_style)
expression_col_list_style <- gene_data_tbl$Expression;
print(expression_col_list_style); class(expression_col_list_style)
chromosome_col_list_style <- gene_data_tbl[["Chromosome"]];
print(chromosome_col_list_style); class(chromosome_col_list_style)
expressed_high <- gene_data_tbl[gene_data_tbl$Expression > 5, ];
print(expressed_high)
chr1_genes <- gene_data_tbl[gene_data_tbl$Chromosome == "chr1", ];
print(chr1_genes)
gene_expression_subset <- gene_data_tbl[, c("GeneID", "Expression")];
print(gene_expression_subset)
gene_expression_subset_list <- gene_data_tbl[c("GeneID", "Expression")];
print(gene_expression_subset_list)
gene_data_tbl$`Expression[1] <- 6.0; gene_data_tbl`$LogExpression <-
log2(gene_data_tbl$Expression); print(gene_data_tbl)
gene_data_tbl$Expressed <- NULL; print(gene_data_tbl)

# Chapter 3 - Programming Fundamentals


gc_content <- 0.65
if (gc_content > 0.6) { print("High GC content detected.") }
expression_level <- 4.5
if (expression_level > 5.0) { print("Gene is highly expressed.") } else {
print("Gene expression is not high.") }
sequence_length <- 1500
if (sequence_length < 100) { print("Short sequence.") } else if
(sequence_length < 1000) { print("Medium sequence.") } else { print("Long
sequence.") }
expression_values <- c(2.1, 8.5, 4.9, 12.3, 1.5); threshold <- 5.0
classification <- ifelse(expression_values > threshold, "High", "Low");
print(classification)
p_values <- c(0.01, 0.25, 0.04, 0.001, 0.5); significance_level <- 0.05

# dplyr Operations
library(dplyr)
data(iris); iris_tbl <- as_tibble(iris); print(iris_tbl)
setosa_data <- filter(iris_tbl, Species == "setosa"); print(setosa_data)
versicolor_long_sepal <- filter(iris_tbl, Species == "versicolor", Sepal.Length
> 6); print(versicolor_long_sepal)
sorted_by_petal_length <- arrange(iris_tbl, desc(Petal.Length));
print(head(sorted_by_petal_length))
sorted_multi <- arrange(iris_tbl, Species, Sepal.Width);
print(head(sorted_multi))
sepal_species_data <- select(iris_tbl, Species, Sepal.Length);
print(head(sepal_species_data))
no_species_data <- select(iris_tbl, -Species); print(head(no_species_data))
petal_data <- select(iris_tbl, starts_with("Petal")); print(head(petal_data))
iris_with_area <- mutate(iris_tbl, Petal.Area = Petal.Length * Petal.Width)
print(head(select(iris_with_area, Petal.Length, Petal.Width, Petal.Area)))
iris_transformed <- mutate(iris_tbl, Sepal.Length.cm = Sepal.Length / 10,
Petal.Length.cm = Petal.Length / 10)
print(head(iris_transformed))
summary_petal_length <- summarise(iris_tbl, avg_petal_length =
mean(Petal.Length)); print(summary_petal_length)
summary_stats <- summarise(iris_tbl, avg_sepal_width = mean(Sepal.Width),
max_petal_length = max(Petal.Length), n_observations = n())
print(summary_stats)
species_summary <- group_by(iris_tbl, Species); print(species_summary)
species_avg_stats <- summarise(species_summary, avg_sepal_length =
mean(Sepal.Length), avg_sepal_width = mean(Sepal.Width), count = n())
print(species_avg_stats)
species_avg_stats_ungrouped <- ungroup(species_avg_stats);
print(species_avg_stats_ungrouped)
result_nested <- summarise(filter(mutate(iris_tbl, Petal.Area = Petal.Length *
Petal.Width), Species == "virginica", Sepal.Length > 7), avg_petal_area =
mean(Petal.Area))
print(result_nested)

# Chapter 4 - Bioinformatics
BiocManager::install(c("Biostrings", "GenomicRanges", "rtracklayer",
"SummarizedExperiment"))
library(Biostrings)
dna_seq <- DNAString("ATGCAATAGCATCGAT"); print(dna_seq); length(dna_seq)
complement_seq <- complement(dna_seq); print(complement_seq)
reverse_complement_seq <- reverseComplement(dna_seq);
print(reverse_complement_seq)
gc_content_seq <- letterFrequency(dna_seq, "GC", as.prob=TRUE);
print(gc_content_seq)
dna_set <- DNAStringSet(c("ATGCAATAGCATCGAT", "GCTAGCTAGCTA", "AAATTTCCCGGG"))
names(dna_set) <- c("Seq1", "Seq2", "Seq3"); print(dna_set)
writeXStringSet(dna_set, "example_sequences.fasta")
loaded_sequences <- readDNAStringSet("example_sequences.fasta");
print(loaded_sequences)

# GenomicRanges
library(GenomicRanges)
gr <- GRanges(
seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
ranges = IRanges(start=c(101, 201, 401, 501, 601, 701, 801, 901, 1001, 1101),
end=c(110, 250, 450, 550, 650, 750, 850, 950, 1050, 1150)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10, GC = seq(0.4, 0.58, length.out=10)
)
print(gr); seqnames(gr); ranges(gr); start(gr); end(gr); width(gr); strand(gr)
mcols(gr); gr$`score; gr`$GC; length(gr)
first_three_ranges <- gr[1:3]; print(first_three_ranges)
chr1_ranges <- gr[seqnames(gr) == "chr1"]; print(chr1_ranges)
high_score_ranges <- gr[gr$score > 7]; print(high_score_ranges)
shifted_gr <- shift(gr, shift = 1000); print(shifted_gr)
resized_gr_start <- resize(gr, width = 20, fix = "start");
print(resized_gr_start)
resized_gr_end <- resize(gr, width = 100, fix = "end"); print(resized_gr_end)
gr_query <- GRanges("chr1", IRanges(c(120, 360), c(180, 410)), score=c(50, 80))
print("Query Ranges:"); print(gr_query); print("Subject Ranges (gr):");
print(gr)
overlaps <- findOverlaps(gr_query, gr); print("Overlap results (indices):");
print(overlaps)
query_hits <- gr_query[queryHits(overlaps)]; subject_hits <-
gr[subjectHits(overlaps)]
print("Query ranges that overlap:"); print(query_hits); print("Subject ranges
that overlap:"); print(subject_hits)
subset_by_overlap <- subsetByOverlaps(gr, gr_query); print("Ranges in gr that
overlap with gr_query:"); print(subset_by_overlap)
nearest_indices <- nearest(gr_query, gr); print("Index of nearest range in gr
for each range in gr_query:"); print(nearest_indices);
print(gr[nearest_indices])
distances <- distanceToNearest(gr_query, gr); print("Distances to nearest
ranges:"); print(distances)
gr_a <- GRanges("chr1", IRanges(c(100, 200), c(150, 250))); gr_b <-
GRanges("chr1", IRanges(c(130, 260), c(180, 300)))
union_gr <- union(gr_a, gr_b); intersect_gr <- intersect(gr_a, gr_b);
setdiff_gr <- setdiff(gr_a, gr_b)
print("Union:"); print(union_gr); print("Intersect:"); print(intersect_gr);
print("Set Difference (A-B):"); print(setdiff_gr)
coverage_chr1 <- coverage(gr[seqnames(gr) == "chr1"]); print(coverage_chr1)

# rtracklayer
library(rtracklayer)
bed_content <- c("chr1\t100\t150\tGeneA\t10\t+", "chr1\t300\t380\tGeneB\t25\t-
", "chr2\t200\t250\tGeneC\t15\t+")
writeLines(bed_content, "example.bed"); bed_gr <- import("example.bed", format
= "bed"); print(bed_gr)
gtf_content <- c("chr1\tHAVANA\texon\t11869\t12227\t.\t+\t.\tgene_id
\"ENSG001\"; transcript_id \"ENST001\"; exon_number 1;",
"chr1\tHAVANA\texon\t12613\t12721\t.\t+\t.\tgene_id \"ENSG001\"; transcript_id
\"ENST001\"; exon_number 2;",
"chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tgene_id \"ENSG001\"; gene_name
\"DDX11L1\";", "chr2\tHAVANA\texon\t5000\t5500\t.\t-\t.\tgene_id \"ENSG002\";
transcript_id \"ENST002\"; exon_number 1;")
writeLines(gtf_content, "example.gtf"); gtf_gr <- import("example.gtf", format
= "gtf"); print(gtf_gr); print(mcols(gtf_gr))
exons_gr <- gtf_gr[gtf_gr$type == "exon"]; print(exons_gr)
exons_gr$`score <- sample(1:100, length(exons_gr), replace=TRUE);
exons_gr`$name <- paste0("Exon_", 1:length(exons_gr))
export(exons_gr, "exported_exons.bed", format = "bed")

# SummarizedExperiment
library(SummarizedExperiment)
counts_matrix <- matrix(rpois(100, lambda = 10), nrow = 20, ncol = 5, dimnames
= list(paste0("Gene", 1:20), paste0("Sample", 1:5)))
print(head(counts_matrix))
feature_data <- GRanges(seqnames = sample(c("chr1", "chr2"), 20, replace =
TRUE), ranges = IRanges(start = sample(1:10000, 20), width = sample(50:500,
20)), strand = sample(c("+", "-"), 20, replace = TRUE), gene_id =
paste0("Gene", 1:20), gc_content = rnorm(20, 0.5, 0.05))
names(feature_data) <- rownames(counts_matrix); print(feature_data)
sample_data <- DataFrame(Treatment = rep(c("Control", "Treated"), length.out =
5), Replicate = c(1, 1, 2, 2, 3), row.names = colnames(counts_matrix))
print(sample_data)
se <- SummarizedExperiment(assays = list(counts = counts_matrix), rowData =
feature_data, colData = sample_data); print(se)
assay(se); assays(se); rowData(se); colData(se); dim(se); nrow(se); ncol(se)
se_subset_genes <- se[1:5, ]; print(se_subset_genes)
se_subset_samples <- se[, se$Treatment == "Treated"]; print(se_subset_samples)
se_subset_both <- se[rowData(se)$`gc_content > 0.55, se`$Replicate == 1];
print(se_subset_both)
normalized_counts <- log2(counts_matrix + 1); assay(se, "logcounts") <-
normalized_counts; print(assays(se))

# Chapter 5 - Data Visualization


library(ggplot2); library(dplyr)
data(iris); iris_tbl <- as_tibble(iris)
base_plot <- ggplot(data = iris_tbl, mapping = aes(x = Sepal.Length, y =
Sepal.Width, color = Species))
scatter_plot <- base_plot + geom_point(); print(scatter_plot)
ggplot(data = iris_tbl, aes(x = Sepal.Length, y = Sepal.Width, color =
Species)) + geom_point()
set.seed(123); num_proteins <- 1000
proteomics_data <- tibble(ProteinID = paste0("Prot", 1:num_proteins),
Control_Abundance = rnorm(num_proteins, mean = 10, sd = 2), Treated_Abundance =
Control_Abundance + rnorm(num_proteins, mean = 0.5, sd = 1.5))
proteomics_data$`Treated_Abundance[1:50] <-
proteomics_data`$Treated_Abundance[1:50] + rnorm(50, mean = 3, sd = 1)
proteomics_data$`Treated_Abundance[51:100] <-
proteomics_data`$Treated_Abundance[51:100] - rnorm(50, mean = 3, sd = 1)
scatter_proteomics <- ggplot(proteomics_data, aes(x = Control_Abundance, y =
Treated_Abundance)) + geom_point(alpha = 0.5) + labs(title = "Protein
Abundance: Treated vs. Control", x = "Log2 Abundance (Control)", y = "Log2
Abundance (Treated)") + theme_minimal()
print(scatter_proteomics)
scatter_proteomics_diag <- scatter_proteomics + geom_abline(intercept = 0,
slope = 1, color = "red", linetype = "dashed"); print(scatter_proteomics_diag)
box_plot_iris <- ggplot(iris_tbl, aes(x = Species, y = Sepal.Width, fill =
Species)) + geom_boxplot() + labs(title = "Sepal Width Distribution by
Species", x = "Species", y = "Sepal Width") + theme_light()
print(box_plot_iris)
violin_plot_iris <- ggplot(iris_tbl, aes(x = Species, y = Petal.Length, fill =
Species)) + geom_violin(trim = FALSE, alpha = 0.7) + geom_jitter(width = 0.1,
height = 0, alpha = 0.5, size = 1.5) + labs(title = "Petal Length Distribution
by Species", x = "Species", y = "Petal Length") + theme_bw()
print(violin_plot_iris)
histogram_petal <- ggplot(iris_tbl, aes(x = Petal.Length)) +
geom_histogram(binwidth = 0.2, fill = "skyblue", color = "black") + labs(title
= "Histogram of Petal Lengths", x = "Petal Length", y = "Frequency")
print(histogram_petal)
density_petal_species <- ggplot(iris_tbl, aes(x = Petal.Length, fill = Species,
color = Species)) + geom_density(alpha = 0.5) + labs(title = "Density of Petal
Lengths by Species", x = "Petal Length", y = "Density")
print(density_petal_species)
bar_plot_counts <- ggplot(iris_tbl, aes(x = Species, fill = Species)) +
geom_bar() + labs(title = "Count of Observations per Species", x = "Species", y
= "Count")
print(bar_plot_counts)
iris_summary <- iris_tbl %>% group_by(Species) %>% summarise(Avg.Sepal.Length =
mean(Sepal.Length)); print(iris_summary)
bar_plot_means <- ggplot(iris_summary, aes(x = Species, y = Avg.Sepal.Length,
fill = Species)) + geom_col() + labs(title = "Average Sepal Length per
Species", x = "Species", y = "Average Sepal Length")
print(bar_plot_means)
set.seed(456)
heatmap_data_matrix <- matrix(rnorm(50 * 10, mean = 0, sd = 1.5), nrow = 50,
ncol = 10, dimnames = list(paste0("Prot", 1:50), paste0("Sample", 1:10)))
heatmap_data_matrix[, 1:5] <- heatmap_data_matrix[, 1:5] + 1;
heatmap_data_matrix[1:20, ] <- heatmap_data_matrix[1:20, ] + 1.5
library(tidyr); library(tibble)
heatmap_data_long <- as.data.frame(heatmap_data_matrix) %>%
rownames_to_column(var = "ProteinID") %>% pivot_longer(cols = -ProteinID,
names_to = "SampleID", values_to = "Log2FoldChange")
heatmap_plot <- ggplot(heatmap_data_long, aes(x = SampleID, y = ProteinID, fill
= Log2FoldChange)) + geom_tile(color = "white", size = 0.1) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
labs(title = "Protein Expression Heatmap", x = "Sample", y = "Protein", fill =
"Log2 FC") + theme_minimal() + theme(axis.text.y = element_text(size = 6),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8), legend.position =
"bottom")
print(heatmap_plot)
set.seed(789); num_proteins_volcano <- 2000
volcano_data <- tibble(ProteinID = paste0("Prot", 1:num_proteins_volcano),
log2FoldChange = rnorm(num_proteins_volcano, mean = 0, sd = 1.5), p.value =
runif(num_proteins_volcano, min = 0, max = 1))
up_indices <- 1:100; volcano_data$`log2FoldChange[up_indices] <-
rnorm(length(up_indices), mean = 3, sd = 1); volcano_data`$p.value[up_indices]
<- rbeta(length(up_indices), 1, 50)
down_indices <- 101:200; volcano_data$`log2FoldChange[down_indices] <-
rnorm(length(down_indices), mean = -3, sd = 1);
volcano_data`$p.value[down_indices] <- rbeta(length(down_indices), 1, 50)
volcano_data <- volcano_data %>% mutate(negLog10p = -log10(p.value),
Significance = case_when(log2FoldChange > 1.5 & p.value < 0.01 ~ "Upregulated",
log2FoldChange < -1.5 & p.value < 0.01 ~ "Downregulated", TRUE ~ "Not
Significant"), Significance = factor(Significance, levels = c("Upregulated",
"Downregulated", "Not Significant")))
volcano_plot <- ggplot(volcano_data, aes(x = log2FoldChange, y = negLog10p,
color = Significance)) + geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue",
"Not Significant" = "grey")) + labs(title = "Volcano Plot of Differential
Protein Abundance", x = "Log2 Fold Change", y = "-Log10 P-value") +
theme_minimal() + theme(legend.position = "bottom") + geom_hline(yintercept = -
log10(0.01), linetype = "dashed", color = "darkgrey") + geom_vline(xintercept =
c(-1.5, 1.5), linetype = "dashed", color = "darkgrey")

# Chapter 6 - Shiny
library(shiny); library(ggplot2)
ui <- fluidPage(titlePanel("Old Faithful Geyser Data Histogram"),
sidebarLayout(sidebarPanel(sliderInput(inputId = "bins", label = "Number of
bins:", min = 1, max = 50, value = 30)), mainPanel(plotOutput(outputId =
"distPlot"))))
server <- function(input, output) { output$`distPlot <- renderPlot({ x <-
faithful[, 2]; bins <- seq(min(x), max(x), length.out = input`$bins + 1);
hist(x, breaks = bins, col = "darkgray", border = "white", xlab = "Waiting time
to next eruption (in mins)", main = "Histogram of waiting times") }) }
shinyApp(ui = ui, server = server)

# Chapter 7 - Performance
slow_function <- function(n) { result <- 0; for (i in 1:n) { result <- result +
log(sqrt(i^2.5)) }; Sys.sleep(0.1); return(result) }
execution_time <- system.time({ result_single <- slow_function(10000) });
print(execution_time)
num_reps <- 10
system.time({ results_sequential <- vector("list", num_reps); for(i in
1:num_reps) { results_sequential[[i]] <- slow_function(5000) } })

You might also like