library(data.table)
Filtrado_variantes
Cargar librería
Leer el archivo vcf del cromosoma 22, pero sólo unas muestras y saltando unas líneas
<- fread("ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz",
vcf_chr22 skip = "#CHROM",
na.strings = ".",
select = c(1,2,4,5,sample(x = c(10:2513), size = 100, replace = FALSE)))
Renombrar y organizar columnas
setnames(vcf_chr22, "#CHROM", "CHROM")
:= paste(CHROM, POS, sep = "_")]
vcf_chr22[, ID setcolorder(vcf_chr22, c(colnames(vcf_chr22)[1:2], "ID", colnames(vcf_chr22)[3:(ncol(vcf_chr22) - 1)]))
Quitar variantes que sólo tengan alelo alternativo, es decir que no sean monomorficas, así como sólo quedarnos con variantes de un único nucleótido.
<- vcf_chr22[!is.na(ALT)]
vcf_chr22 <- vcf_chr22[grepl("^[ATCG]$", ALT)]
vcf_chr22 <- vcf_chr22[grepl("^[ATCG]$", REF)] vcf_chr22
Quitar variantes con más del 5% de datos faltantes
<- names(vcf_chr22)[6:ncol(vcf_chr22)] # columns with genotype data
genotype_cols
# Create a temporary column for counting missing genotypes
:= rowSums(.SD == ".", na.rm = TRUE), .SDcols = genotype_cols]
vcf_chr22[, missing_count
# Calculate total number of samples (excluding the fixed columns)
<- length(genotype_cols)
num_samples
# Calculate the percentage of missing data
:= missing_count / (num_samples * 2)] # Each sample has two alleles
vcf_chr22[, missing_pct
# Keep rows with less than 5% missing data
<- vcf_chr22[missing_pct < 0.05]
vcf_chr22
# Optionally remove the temporary columns
`:=`(missing_count = NULL, missing_pct = NULL)] vcf_chr22[,
Quitar muestras con más del 5% de datos faltantes
#Identify individuals with < 5% missing data
<- vcf_chr22[, lapply(.SD, function(x) sum(x == ".")), .SDcols = genotype_cols]
missing_counts
# Calculate total number of variants
<- nrow(vcf_chr22)
total_variants
# Calculate percentage of missing data for each individual
<- colSums(missing_counts) / total_variants
missing_pct
# Identify individuals to keep (< 5% missing data)
<- names(missing_pct[missing_pct < 0.05])
individuals_to_keep
# Filter the dataset to keep only relevant columns
<- c("CHROM", "POS", "ID", "REF", "ALT")
info <- c(info, individuals_to_keep)
info_ikeep <- vcf_chr22[, ..info_ikeep] vcf_chr22
¿Cómo traducimos los genotipos a un número?
:= lapply(.SD, function(gt) {
vcf_chr22[, (genotype_cols) # Split the genotype string (e.g., "0|1") and count the number of '1's
sapply(strsplit(gt, "\\|"), function(alleles) sum(alleles == "1", na.rm = TRUE))
= individuals_to_keep] }), .SDcols
¿Qué sabemos sobre nuestras muestras?
En esta ocasión sólo mantendremos la información sobre el cohort al que pertenecen
<- fread("igsr_samples_1000G.tsv")
metadata setnames(metadata, "Sample name", "Sample_name")
setnames(metadata, "Population code", "Population_code")
<- metadata[, .(Sample_name, Population_code)] metadata
Conversión del formato
Este paso lo llevamos a cabo para poder agregar información en la misma tabla sobre el cohort de la muestra.
<- melt(vcf_chr22,
vcf_chr22_long id.vars = c("CHROM", "POS", "ID", "REF", "ALT"), # Columns to keep as identifiers
measure.vars = individuals_to_keep, # Genotype columns (samples)
variable.name = "Sample_name", # Name of the new 'sample' column
value.name = "Genotype") # Values will be in 'Genotype'
Unir nuestro metadata con los datos de genotipado.
<- merge(vcf_chr22_long, metadata, by = "Sample_name") vcf_chr22_long
Algunos estadísticos de utilidad: frecuencia alélica
# Calculate the number of alternative alleles per variant and per population
<- vcf_chr22_long[, .(
variant_summary Total_alt_alleles = sum(Genotype, na.rm = TRUE),
Num_individuals = uniqueN(Sample_name)
# Sum the Genotype values to get total alternative alleles for each variant
= .( ID, Population_code)] # Group by variant and population
), by
:= Total_alt_alleles/(Num_individuals*2)] variant_summary[, alt_freq