Filtrado_variantes

Cargar librería

library(data.table)

Leer el archivo vcf del cromosoma 22, pero sólo unas muestras y saltando unas líneas

vcf_chr22 <- fread("ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz", 
                   skip = "#CHROM", 
                   na.strings = ".", 
                   select = c(1,2,4,5,sample(x = c(10:2513), size = 100, replace = FALSE)))

Renombrar y organizar columnas

setnames(vcf_chr22, "#CHROM", "CHROM")
vcf_chr22[, ID := paste(CHROM, POS, sep = "_")]
setcolorder(vcf_chr22, c(colnames(vcf_chr22)[1:2], "ID", colnames(vcf_chr22)[3:(ncol(vcf_chr22) - 1)]))

Quitar variantes que sólo tengan alelo alternativo, es decir que no sean monomorficas, así como sólo quedarnos con variantes de un único nucleótido.

vcf_chr22 <- vcf_chr22[!is.na(ALT)]
vcf_chr22 <- vcf_chr22[grepl("^[ATCG]$", ALT)]
vcf_chr22 <- vcf_chr22[grepl("^[ATCG]$", REF)]

Quitar variantes con más del 5% de datos faltantes

genotype_cols <- names(vcf_chr22)[6:ncol(vcf_chr22)] # columns with genotype data

# Create a temporary column for counting missing genotypes
vcf_chr22[, missing_count := rowSums(.SD == ".", na.rm = TRUE), .SDcols = genotype_cols]

# Calculate total number of samples (excluding the fixed columns)
num_samples <- length(genotype_cols)

# Calculate the percentage of missing data
vcf_chr22[, missing_pct := missing_count / (num_samples * 2)]  # Each sample has two alleles

# Keep rows with less than 5% missing data
vcf_chr22 <- vcf_chr22[missing_pct < 0.05]

# Optionally remove the temporary columns
vcf_chr22[, `:=`(missing_count = NULL, missing_pct = NULL)]

Quitar muestras con más del 5% de datos faltantes

#Identify individuals with < 5% missing data
missing_counts <- vcf_chr22[, lapply(.SD, function(x) sum(x == ".")), .SDcols = genotype_cols]

# Calculate total number of variants
total_variants <- nrow(vcf_chr22)

# Calculate percentage of missing data for each individual
missing_pct <- colSums(missing_counts) / total_variants

# Identify individuals to keep (< 5% missing data)
individuals_to_keep <- names(missing_pct[missing_pct < 0.05])

# Filter the dataset to keep only relevant columns
info <- c("CHROM", "POS", "ID", "REF", "ALT")
info_ikeep <- c(info, individuals_to_keep)
vcf_chr22 <- vcf_chr22[, ..info_ikeep]

¿Cómo traducimos los genotipos a un número?

vcf_chr22[, (genotype_cols) := lapply(.SD, function(gt) {
  # Split the genotype string (e.g., "0|1") and count the number of '1's
  sapply(strsplit(gt, "\\|"), function(alleles) sum(alleles == "1", na.rm = TRUE))
}), .SDcols = individuals_to_keep]

¿Qué sabemos sobre nuestras muestras?

En esta ocasión sólo mantendremos la información sobre el cohort al que pertenecen

metadata <- fread("igsr_samples_1000G.tsv")
setnames(metadata, "Sample name", "Sample_name")
setnames(metadata, "Population code", "Population_code")
metadata <- metadata[, .(Sample_name, Population_code)]

Conversión del formato

Este paso lo llevamos a cabo para poder agregar información en la misma tabla sobre el cohort de la muestra.

vcf_chr22_long <- melt(vcf_chr22, 
                 id.vars = c("CHROM", "POS", "ID", "REF", "ALT"),  # Columns to keep as identifiers
                 measure.vars = individuals_to_keep,                     # Genotype columns (samples)
                 variable.name = "Sample_name",                    # Name of the new 'sample' column
                 value.name = "Genotype")                          # Values will be in 'Genotype'

Unir nuestro metadata con los datos de genotipado.

vcf_chr22_long <- merge(vcf_chr22_long, metadata, by = "Sample_name")

Algunos estadísticos de utilidad: frecuencia alélica

# Calculate the number of alternative alleles per variant and per population
variant_summary <- vcf_chr22_long[, .(
  Total_alt_alleles = sum(Genotype, na.rm = TRUE),
  Num_individuals = uniqueN(Sample_name)
  # Sum the Genotype values to get total alternative alleles for each variant
  ), by = .( ID, Population_code)]  # Group by variant and population

variant_summary[, alt_freq:= Total_alt_alleles/(Num_individuals*2)]