Analysis Surrounding HRP2 and HRP3 deletions
  • Home
  • Final Manuscript
  • Window Analysis
    • Windows
    • Running Haplotype Reconstruction on Windows
    • Genomic Locations Of Final Windows

    • Window analysis by coverage
    • Processing Coverage Initial Windows
    • Processing Coverage on Sub Windows

    • Window analysis of deletion patterns
    • Telomere healing
    • Processing Samples with HRP2 Deletions TARE1
    • Processing Samples with Chr11 Deletions TARE1
    • Processing Samples for chr13 TARE1 presence
    • pfmdr1 duplication
    • Processing Samples with pfhrp3 13-5++ deletion pattern

    • Final Coverage Windows
    • Processing Coverage on Sub Windows - final

    • Window analysis by sequence/variation
    • Plotting haplotype variation within regions

    • Analysis by SNP variant analysis
    • Calling variants and Estimating COI
    • Plotting BiAllelic Variant Plots
  • HB3/SD01 Longreads analysis
    • Set up
    • Creating Hybrid genomes

    • Spanning Raw Reads analysis
    • Processing Spanning Reads
    • SD01 spanning specific

    • HB3
    • Processing chr11 and chr13
    • Final Process Assembly

    • SD01
    • Running SD01 assemblies
    • Processing SD01 assemblies

    • Both
    • Illumina against HB3/SD01 Assemblies
    • Comparison To 3D7 Simplified View
  • rRNA Segmental Duplications

    • Chr11/13 Duplicated Region
    • Characterizing Duplicated Region
  • Related Genomic Regions Vis
    • Analysis
    • Finding shared regions genome wide
    • Mapping out surrounding Genes on Assembled Strains

    • Misc
    • Plotting HRPs Tandem Repeats
    • Info on all rRNA
  • Comparing to related Plasmodiums
    • Comparing to all 6 Plasmodium Laverania
    • Comparing to all 6 Plasmodium Laverania Gene Arrangements chr05,07,08,11,13
    • Comparing to HRP2/3 falciparum sequences
  • References
    • Getting Raw Data References
    • References
    • R Session and Commandline tools info

Contents

  • Investigating shared region within pacbio assemblies(Otto et al. 2018)
  • Beginning of the section
  • Middle
  • End of the section
  • Conserved score
  • Comparing chromsome 11 and 13 segments
    • Comparing Percent Identity
    • Comparing Number of variants
  • Chromosome vs chromosome percent identity
  • Writing out region

Characterizing Duplicated Region

  • Show All Code
  • Hide All Code

  • View Source

Investigating shared region within pacbio assemblies(Otto et al. 2018)

Code
for x in /tank/data/genomes/plasmodium/genomes/pf/genomes/*.fasta; do elucidator splitFile --fasta ${x} --overWrite --trimAtWhiteSpace  ; done;

These strains have complete chromosome 11s and 13s

Pf3D7
Pf7G8
PfCD01
PfDd2
PfGA01
PfGB4
PfGN01
PfKE01
PfKH02
PfSN01

Code
strains = c("Pf3D7","Pf7G8","PfCD01","PfDd2","PfGA01","PfGB4","PfGN01","PfKE01","PfKH02","PfSN01")

cat(strains, sep = "\n", file = "stableStrains.txt")

strainsDf = tibble(strain = strains)
strainsDf = strainsDf %>% 
  mutate(chrom11 = paste0(strain, "_11*.fasta")) %>% 
  mutate(chrom13 = paste0(strain, "_13*.fasta"))

mvCmd = paste0("mv ", paste0(strainsDf$chrom11, collapse = " "), " ", paste0(strainsDf$chrom13, collapse = " "), " combinedChrom11_13/")

cat(mvCmd)
mv Pf3D7_11*.fasta Pf7G8_11*.fasta PfCD01_11*.fasta PfDd2_11*.fasta PfGA01_11*.fasta PfGB4_11*.fasta PfGN01_11*.fasta PfKE01_11*.fasta PfKH02_11*.fasta PfSN01_11*.fasta Pf3D7_13*.fasta Pf7G8_13*.fasta PfCD01_13*.fasta PfDd2_13*.fasta PfGA01_13*.fasta PfGB4_13*.fasta PfGN01_13*.fasta PfKE01_13*.fasta PfKH02_13*.fasta PfSN01_13*.fasta combinedChrom11_13/
Code
elucidator profileSharedKmerBlocks --seq Pf3D7_11_v3.fasta --dout chroms11_13_klen31 --kLen 31 --seqsDir ./ --fasta Pf3D7_11_v3.fasta --overWriteDir

cd chroms11_13_klen31

elucidator bedCoordSort --bed perfect.bed | elucidator extendBedRegions --left 5 --right 5 --bed STDIN | bedtools merge | elucidator bed3ToBed6 --bed STDIN  | elucidator filterBedRecordsByLength --bed STDIN  --minLen 65 | elucidator bedGetIntersectingGenesInGff  --gff /work/nhathawa/genomes/pfGenomes_updated/info//gff/Pf3D7.gff --extraAttributes description --overWrite --bed STDIN --out perfectlyShared_minlen65.bed

elucidator getFastaWithBed  --twoBit /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.2bit  --overWrite --bed perfectlyShared_minlen65.bed --out perfectlyShared_minlen65


elucidator extractRefSeqsFromGenomes --genomeDir /tank/data/genomes/plasmodium/genomes/pf/genomes/ --gffDir /tank/data/genomes/plasmodium/genomes/pf/info/gff/ --primaryGenome Pf3D7 --bed perfectlyShared_minlen65.bed --outputDir extractedRefSeqs --identity 95 --coverage 95 --numThreads 15
Code
perfectlyShared_minlen65_regions = readr::read_tsv("combinedChrom11_13/chroms11_13_klen31/perfectlyShared_minlen65.bed", col_names = F)
perfectlyShared_minlen65_regions = perfectlyShared_minlen65_regions %>% 
  mutate(X4 = paste0(X4, "-for"))

allRegions = tibble()
for(region in perfectlyShared_minlen65_regions$X4){
  for(strain in strains){
    fnp= paste0("combinedChrom11_13/chroms11_13_klen31/extractedRefSeqs/", region, "/beds/", strain, "_region.bed")
    if(file.exists(fnp)){
      currentRegion = readr::read_tsv(fnp, col_names = F, show_col_types = F)
      currentRegion$X4 = region
      currentRegion$strain = strain
      allRegions = bind_rows(
        allRegions, currentRegion
      )
    }
  }
}

allRegions_sum = allRegions %>% 
  group_by(strain, X1) %>% 
  summarise(start = min(X2), 
         end = max(X3)) %>% 
  filter(grepl("_1[13]", X1)) %>% 
  rename(`#chrom` = X1) %>% 
  mutate(start = start - 5000) %>% 
  mutate(end = end + 5000) %>% 
  mutate(name = paste0(`#chrom`, "-", start, "-", end), 
         len = end - start, 
         strand = "+") 

for(strainName in strains){
  allRegions_sum_byStrain = allRegions_sum %>% 
    filter(strain == strainName)
  allRegions_sum_byStrain = allRegions_sum_byStrain %>% 
    group_by() %>% 
    select(-strain)
  write_tsv(allRegions_sum_byStrain, paste0("combinedChrom11_13/chroms11_13_regions/", strainName, ".bed"))
}




allRegions_sum_exact = allRegions %>% 
  group_by(strain, X1) %>% 
  summarise(start = min(X2), 
         end = max(X3)) %>% 
  filter(grepl("_1[13]", X1)) %>% 
  rename(`#chrom` = X1) %>% 
  mutate(start = start) %>% 
  mutate(end = end) %>% 
  mutate(name = paste0(`#chrom`, "-", start, "-", end), 
         len = end - start, 
         strand = "+") 

for(strainName in strains){
  allRegions_sum_byStrain = allRegions_sum_exact %>% 
    filter(strain == strainName)
  allRegions_sum_byStrain = allRegions_sum_byStrain %>% 
    group_by() %>% 
    select(-strain)
  write_tsv(allRegions_sum_byStrain, paste0("combinedChrom11_13/chroms11_13_regions_exact/", strainName, ".bed"))
}
Code
cd chroms11_13_regions_exact 

for x in *.bed; do elucidator getFastaWithBed --bed $x --twoBit /tank/data/genomes/plasmodium/genomes/pf/genomes/${x%%.bed}.2bit --overWrite --out ${x%%.bed}.fasta ; done;

mkdir sequences
cd sequences

for x in ../*.fasta; do elucidator splitFile --fasta  $x ; done;

elucidator profileSharedKmerBlocks --seq Pf3D7_11_v3-1913023-1938396.fasta --dout chroms11_13_klen31 --kLen 31 --seqsDir ./ --fasta Pf3D7_11_v3-1913023-1938396.fasta --overWriteDir
Code
cd chroms11_13_regions

mkdir allSequences
for x in Pf*.bed; do elucidator getFastaWithBed --bed $x --twoBit /tank/data/genomes/plasmodium/genomes/pf/genomes/${x%%.bed}.2bit --overWrite --out ${x%%.bed}.fasta ; done;

cat *.fasta > allSequences/all.fasta
cat Pf*.bed > all.bed
cd allSequences

muscle -in all.fasta -out aln_all.fasta

elucidator trimToLen --fasta aln_all.fasta --length 21945 --overWrite --out trimmed_to_21945_aln_all.fasta
elucidator trimFront --fasta trimmed_to_21945_aln_all.fasta --forwardBases 21845 --overWrite --out trimmedFrom_21845_trimmed_to_21945_aln_all.fasta
elucidator sortReads --fasta trimmedFrom_21845_trimmed_to_21945_aln_all.fasta --removeGaps  --overWrite
elucidator determineRegionLastz --individual --fasta sorted_trimmedFrom_21845_trimmed_to_21945_aln_all.fasta --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.2bit   -out sorted_trimmedFrom_21845_trimmed_to_21945_aln_all.bed 


elucidator trimToLen --fasta aln_all.fasta --length 6051 --overWrite --out trimmed_to_6051_aln_all.fasta
elucidator trimFront --fasta trimmed_to_6051_aln_all.fasta --forwardBases 5951 --overWrite --out trimmedFrom_5951_trimmed_to_6051_aln_all.fasta
elucidator sortReads --fasta trimmedFrom_5951_trimmed_to_6051_aln_all.fasta --removeGaps  --overWrite
elucidator determineRegionLastz --individual --fasta sorted_trimmedFrom_5951_trimmed_to_6051_aln_all.fasta --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.2bit --overWrite --out sorted_trimmedFrom_5951_trimmed_to_6051_aln_all.bed 

elucidatorlab getMalnConservedness --fasta aln_all.fasta  --blockSize 10 --overWrite --out aln_all_conserved.tab.txt  
elucidatorlab getAlnPosToRealPosTable --fasta aln_all.fasta --overWrite --out aln_all_position_table.tab.txt  

Beginning of the section

Pf3D7_11_v3-1918029-1918124–Pf3D7_13_v3-2792022-2792117

Middle

Variation in the middle, does not appear to be chromosome specific (though could have been a difference in assembly)

End of the section

Pf3D7_11_v3-1933291-1933390–Pf3D7_13_v3-2807298-2807397

The region is ~15,360 kp long, so could be hard to get through even with long reads

Conserved score

Plotting the percent conserved per position across the region including the before and after shared segment, all strains appear to start and end at the same position

Code
conservedPosTab = readr::read_tsv("combinedChrom11_13/chroms11_13_regions/allSequences/aln_all_position_table.tab.txt") %>% 
  filter("Pf3D7_11_v3-1913023-1938396" == name) %>% 
  rename(pos = alignPosition)

conservedMulti = readr::read_tsv("combinedChrom11_13/chroms11_13_regions/allSequences/aln_all_conserved.tab.txt")

conservedMulti_max = conservedMulti %>% 
  group_by(pos) %>% 
  filter(frac == max(frac)) %>% 
  filter("----------" != block ) %>% 
  left_join(conservedPosTab) %>% 
  filter(!is.na(name)) %>% 
  filter()

blockSize = 30

conservedMulti_max_rounding = tibble(rawPos = 1:(nrow(conservedMulti_max) - blockSize),
                                     realPosition = numeric(nrow(conservedMulti_max) - blockSize), 
                                     rounded = numeric(nrow(conservedMulti_max) - blockSize))


for(row in 1:(nrow(conservedMulti_max) - blockSize)){
  round = mean(conservedMulti_max$frac[row:(row+blockSize)])
  if(round > 1){
    print(row)
    print(blockSize)
    print(conservedMulti_max$frac[row:(row+blockSize)])
    print(round)
    stop()
  }
  minPos = min(conservedMulti_max$realPosition[row:(row+blockSize)])
  conservedMulti_max_rounding$realPosition[row] = minPos
  conservedMulti_max_rounding$rounded[row] = round
}

conservedMulti_max_rounding_filt = conservedMulti_max_rounding %>% 
  group_by(realPosition) %>% 
  mutate(id = row_number()) %>% 
  filter(id == 1) %>% 
  group_by()
Code
ggplot(conservedMulti_max_rounding_filt) + 
  geom_bar(aes(x = realPosition, y = rounded), stat = "identity")+
  sofonias_theme

Code
cd chroms11_13_regions_exact

cat *.fasta > all.fasta 
elucidator compareAllByAll --fasta all.fasta --verbose --numThreads 12 
elucidator createSharedSubSegmentsFromRefSeqs --fasta out.fasta --minimumKlen 12 --dout sharedSubSeqs_occurence6 --filterRegionsWithinRegions --refSeqName Pf3D7_11_v3-1918023-1933396 --overWriteDir --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.fasta --minSubRegionLen 150 --maxSubRegionLen 300 --minBlockRegionLen 30 --doNotCollapseLowFreqNodes --correctionOccurenceCutOff 6 --includeFromSubRegionSize 30

muscle -in all.fasta -out aln_all.fasta

FastTree -nt aln_all.fasta > aln_all.nwk

elucidatorlab getMalnConservedness --fasta aln_all.fasta  --blockSize 10 --overWrite --out aln_all_conserved.tab.txt  
elucidatorlab getAlnPosToRealPosTable --fasta aln_all.fasta --overWrite --out aln_all_position_table.tab.txt  

elucidator multipleAlnProteinToPcaInput --fasta aln_all.fasta --overWrite --out pcaMat_aln_all.fasta
elucidator runTRF --fasta all.fasta --overWriteDir --dout all_trfOutput --supplement --genomicLocation ../all.bed

elucidator bedFilterRegionsCompletelyInOther --bed all_trfOutput/genomic_combined.bed --intersectWithBed all_trfOutput/genomic_combined.bed | elucidator bedAddSmartIDForPlotting --bed STDIN --out all_trfOutput/filtered_genomic_combined.bed --overWrite 

elucidator countBasesPerPosition --fasta aln_all.fasta --overWrite --out aln_all_baseCounts.tab.txt

#elucidator createSharedSubSegmentsFromRefSeqs --fasta out.fasta --minimumKlen 12 --dout sharedSubSeqs_occurence6 --filterRegionsWithinRegions --refSeqName Pf3D7_11_v3-1918023-1933396 --overWriteDir --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.fasta --minSubRegionLen 150 --maxSubRegionLen 300 --minBlockRegionLen 30 --doNotCollapseLowFreqNodes --correctionOccurenceCutOff 6 --includeFromSubRegionSize 30

#cd sharedSubSeqs_occurence6/sharedLocs

#elucidator bedGetIntersectingGenesInGff  --gff /tank/data/genomes/plasmodium/genomes/pf/info/gff/Pf3D7.gff --extraAttributes description --overWrite --bed 0_ref_sharedLocs_genomic.bed --out 0_ref_sharedLocs_genomic_withGeneInfo.bed

Comparing chromsome 11 and 13 segments

The chromosome 11 and 13 regions appear to be extremely closely related to each other and there is no clear variation specific to chromosome 11 or 13

Code
library(ggtree)
tree = ape::read.tree("combinedChrom11_13/chroms11_13_regions_exact/sequences/aln_all.nwk")


tree$tip.label = gsub("_v3", "", gsub("-.*", "", tree$tip.label))

tipDf = tibble(seq = tree$tip.label) %>% 
  mutate(chrom = ifelse(grepl("_11", seq), "chrom11", "chrom13"))
tipDf_chrom11 = tipDf %>% 
  filter(chrom == "chrom11")
tipDf_chrom13 = tipDf %>% 
  filter(chrom == "chrom13")

groups = list(chrom11 = tipDf_chrom11$seq, 
              chrom13 = tipDf_chrom13$seq)

tree = groupOTU(tree, groups)

sharedRegion_pacbioAssembliesPlot = ggtree::ggtree(tree, show.legend = FALSE) + 
  hexpand(.1) + 
  geom_tiplab(aes(color = group), show.legend = FALSE) + 
  scale_color_manual(values = c("#005AC8", "#AA0A3C")) 
Code
print(sharedRegion_pacbioAssembliesPlot)

Code
pdf("sharedRegion_pacbioAssemblies.pdf", width = 7.5, height = 7.5/2, useDingbats = F)
print(sharedRegion_pacbioAssembliesPlot)
dev.off()
quartz_off_screen 
                2 

Comparing Percent Identity

All segments between strains are more >99% related to each other

Code
allByAllComp = readr::read_tsv("combinedChrom11_13/chroms11_13_regions_exact/sequences/out.tab.txt")
Code
allByAllComp_select = allByAllComp %>% 
  select(OtherReadId, ReadId, perId) %>% 
  spread(ReadId, perId)

addRow =  matrix(c(colnames(allByAllComp_select)[2], rep(NA, ncol(allByAllComp_select) - 1)), nrow = 1)
colnames(addRow) = colnames(allByAllComp_select)
addRow = as_tibble(addRow) 
for(col in 2:ncol(addRow)){
  addRow[,col] = as.numeric(addRow[,col])
}


allByAllComp_select = as_tibble(addRow) %>% bind_rows(allByAllComp_select)


allByAllComp_select_mat = as.matrix(allByAllComp_select[,2:ncol(allByAllComp_select)])
rownames(allByAllComp_select_mat) = allByAllComp_select$OtherReadId

allByAllComp_select_mat = as.matrix(as.dist(allByAllComp_select_mat))
allByAllComp_select_mat[0 == allByAllComp_select_mat] = NA
Code
library(circlize)
meta = readr::read_tsv("../../meta/metadata/meta.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))
metaByBioSample = readr::read_tsv("../../meta/metadata/metaByBioSample.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))



allByAllComp_select_mat_noLabels = allByAllComp_select_mat
# rownames(allByAllComp_select_mat_noLabels) = NULL 
# colnames(allByAllComp_select_mat_noLabels) = NULL


rownames(allByAllComp_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", rownames(allByAllComp_select_mat_noLabels)))))
colnames(allByAllComp_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", colnames(allByAllComp_select_mat_noLabels)))))


col_fun = colorRamp2(c(
  min(allByAllComp_select_mat_noLabels, na.rm = T),
  min(allByAllComp_select_mat_noLabels, na.rm = T) + (
    max(allByAllComp_select_mat_noLabels, na.rm = T) - min(allByAllComp_select_mat_noLabels, na.rm = T)
  ) / 2,
  max(allByAllComp_select_mat_noLabels, na.rm = T)
),
c("#4575b4", "#ffffbf", "#d73027"))
#c("#ffeda0", "#feb24c", "#f03b20"))



topAnnoDat = tibble(name = colnames(allByAllComp_select_mat)) %>% 
  mutate(sampchrom = gsub("_v3", "", gsub("-.*", "", name))) %>% 
  mutate(sample = gsub("Dd2", "DD2", gsub("_.*", "", gsub("Pf", "", sampchrom)))) %>% 
  mutate(chrom = gsub(".*_", "", sampchrom)) %>% 
  left_join(metaByBioSample %>% 
              select(sample, secondaryRegion))

topAnno_df = topAnnoDat[,c("chrom", "secondaryRegion")] %>% 
  rename(continent = secondaryRegion) %>% 
  mutate(continent = gsub("Netherlands", "AFRICA", continent) ) %>% 
  as.data.frame()

topAnnoColors = createColorListFromDf(topAnno_df)

topAnnoColors[["continent"]] = c("AFRICA"  =   "#E69F00", 
                                 "ASIA" =      "#F0E442", 
                                 "OCEANIA"  =  "#F748A5", 
                                 "S_AMERICA" = "#0072B2", 
                                 "Netherlands"="black")

topAnnoColors[["chrom"]] = c("11" = "#008DF9", "13" = "#A40122")



topAnno = HeatmapAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # the annotation direction doesn't appear to work right now 
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

sideAnno = rowAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # ,
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

colnames(allByAllComp_select_mat_noLabels) = gsub("3D7", "**3D7", colnames(allByAllComp_select_mat_noLabels))
rownames(allByAllComp_select_mat_noLabels) = gsub("3D7", "**3D7", rownames(allByAllComp_select_mat_noLabels))

hc = hclust(dist(allByAllComp_select_mat_noLabels))
allByAllComp_select_mat_heatmap = Heatmap(
  allByAllComp_select_mat_noLabels,
  rect_gp = gpar(type = "none"),
  column_dend_side = "bottom",
  cell_fun = function(j, i, x, y, w, h, fill) {
    if (as.numeric(x) <= 1 - as.numeric(y) + 1e-6) {
      grid.rect(x, y, w, h, gp = gpar(fill = fill, col = fill))
    }
  },
  # cluster_rows = hc,
  # cluster_columns = hc,
  row_names_side = "left",
  name = "%Identity",
  col = col_fun,
  bottom_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"), 
  show_column_dend = F, 
  show_heatmap_legend = F
  
  # , 
  # heatmap_legend_param = list(
  #   direction = "horizontal"
  # )
)

chrom_lgd = Legend(labels = sort(unique(topAnno_df$chrom)), title = "chrom", legend_gp = gpar(fill = topAnnoColors[["chrom"]][sort(unique(topAnno_df$chrom))]))
continent_lgd = Legend(labels = sort(unique(topAnno_df$continent)), title = "continent", legend_gp = gpar(fill = topAnnoColors[["continent"]][sort(unique(topAnno_df$continent))]))
heatmap_lgd = Legend(col_fun = col_fun, title = "%Identity")
Code
draw(allByAllComp_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.775, "npc"), y = unit(0.85, "npc"))

Code
pdf("sharedRegion_pacbioAssemblies_percentIdentity.pdf", width = 7.5, height = 5, useDingbats = F)
draw(allByAllComp_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.775, "npc"), y = unit(0.85, "npc"))
dev.off()
quartz_off_screen 
                2 

Comparing Number of variants

Code
allByAllCompVarCount_select = allByAllComp %>% 
  select(OtherReadId, ReadId, totalDiffs) %>% 
  spread(ReadId, totalDiffs)

addRow =  matrix(c(colnames(allByAllCompVarCount_select)[2], rep(NA, ncol(allByAllCompVarCount_select) - 1)), nrow = 1)
colnames(addRow) = colnames(allByAllCompVarCount_select)
addRow = as_tibble(addRow) 
for(col in 2:ncol(addRow)){
  addRow[,col] = as.numeric(addRow[,col])
}


allByAllCompVarCount_select = as_tibble(addRow) %>% bind_rows(allByAllCompVarCount_select)


allByAllCompVarCount_select_mat = as.matrix(allByAllCompVarCount_select[,2:ncol(allByAllCompVarCount_select)])
rownames(allByAllCompVarCount_select_mat) = allByAllCompVarCount_select$OtherReadId

allByAllCompVarCount_select_mat = as.matrix(as.dist(allByAllCompVarCount_select_mat))
allByAllCompVarCount_select_mat[0 == allByAllCompVarCount_select_mat] = NA
Code
library(circlize)
meta = readr::read_tsv("../../meta/metadata//meta.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))
metaByBioSample = readr::read_tsv("../../meta/metadata/metaByBioSample.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))



allByAllCompVarCount_select_mat_noLabels = allByAllCompVarCount_select_mat
# rownames(allByAllCompVarCount_select_mat_noLabels) = NULL 
# colnames(allByAllCompVarCount_select_mat_noLabels) = NULL


rownames(allByAllCompVarCount_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", rownames(allByAllCompVarCount_select_mat_noLabels)))))
colnames(allByAllCompVarCount_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", colnames(allByAllCompVarCount_select_mat_noLabels)))))


col_fun = colorRamp2(c(
  min(allByAllCompVarCount_select_mat_noLabels, na.rm = T),
  min(allByAllCompVarCount_select_mat_noLabels, na.rm = T) + (
    max(allByAllCompVarCount_select_mat_noLabels, na.rm = T) - min(allByAllCompVarCount_select_mat_noLabels, na.rm = T)
  ) / 2,
  max(allByAllCompVarCount_select_mat_noLabels, na.rm = T)
),
c("#006837", "#ffffbf", "#d73027"))
#c("#4575b4", "#ffffbf", "#d73027"))
#c("#ffeda0", "#feb24c", "#f03b20"))



topAnnoDat = tibble(name = colnames(allByAllCompVarCount_select_mat)) %>% 
  mutate(sampchrom = gsub("_v3", "", gsub("-.*", "", name))) %>% 
  mutate(sample = gsub("Dd2", "DD2", gsub("_.*", "", gsub("Pf", "", sampchrom)))) %>% 
  mutate(chrom = gsub(".*_", "", sampchrom)) %>% 
  left_join(metaByBioSample %>% 
              select(sample, secondaryRegion))

topAnno_df = topAnnoDat[,c("chrom", "secondaryRegion")] %>% 
  rename(continent = secondaryRegion) %>% 
  mutate(continent = gsub("Netherlands", "AFRICA", continent) ) %>% 
  as.data.frame()

topAnnoColors = createColorListFromDf(topAnno_df)

topAnnoColors[["continent"]] = c("AFRICA"  =   "#E69F00", 
                                 "ASIA" =      "#F0E442", 
                                 "OCEANIA"  =  "#F748A5", 
                                 "S_AMERICA" = "#0072B2", 
                                 "Netherlands"="black")

topAnnoColors[["chrom"]] = c("11" = "#008DF9", "13" = "#A40122")



topAnno = HeatmapAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # the annotation direction doesn't appear to work right now 
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

sideAnno = rowAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # ,
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

colnames(allByAllCompVarCount_select_mat_noLabels) = gsub("3D7", "**3D7", colnames(allByAllCompVarCount_select_mat_noLabels))
rownames(allByAllCompVarCount_select_mat_noLabels) = gsub("3D7", "**3D7", rownames(allByAllCompVarCount_select_mat_noLabels))

hc = hclust(dist(allByAllCompVarCount_select_mat_noLabels))
allByAllCompVarCount_select_mat_heatmap = Heatmap(
  allByAllCompVarCount_select_mat_noLabels,
  rect_gp = gpar(type = "none"),
  column_dend_side = "bottom",
  cell_fun = function(j, i, x, y, w, h, fill) {
    if (as.numeric(x) <= 1 - as.numeric(y) + 1e-6) {
      grid.rect(x, y, w, h, gp = gpar(fill = fill, col = fill))
    }
  },
  # cluster_rows = hc,
  # cluster_columns = hc,
  row_names_side = "left",
  name = "Number of \nVariants",
  col = col_fun,
  bottom_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"), 
  show_column_dend = F, 
  show_heatmap_legend = F,
  na_col = "grey80",
  # , 
  # heatmap_legend_param = list(
  #   direction = "horizontal"
  # )
)

chrom_lgd = Legend(labels = sort(unique(topAnno_df$chrom)), title = "chrom", legend_gp = gpar(fill = topAnnoColors[["chrom"]][sort(unique(topAnno_df$chrom))]))
continent_lgd = Legend(labels = sort(unique(topAnno_df$continent)), title = "continent", legend_gp = gpar(fill = topAnnoColors[["continent"]][sort(unique(topAnno_df$continent))]))
heatmap_lgd = Legend(col_fun = col_fun, title = "Number of \nVariants")
Code
draw(allByAllCompVarCount_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.80, "npc"), y = unit(0.85, "npc"))

Code
pdf("sharedRegion_pacbioAssemblies_varCount.pdf", width = 7.5, height = 5, useDingbats = F)
draw(allByAllCompVarCount_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.80, "npc"), y = unit(0.85, "npc"))
dev.off()
quartz_off_screen 
                2 
Code
allByAllComp_select_mat_noLabels_mod = allByAllComp_select_mat_noLabels
allByAllComp_select_mat_noLabels_mod = allByAllComp_select_mat_noLabels_mod - min(allByAllComp_select_mat_noLabels_mod, na.rm = T)
allByAllComp_select_mat_noLabels_mod[is.na(allByAllComp_select_mat_noLabels_mod)] = 0
Heatmap3D(allByAllComp_select_mat_noLabels_mod - min(allByAllComp_select_mat_noLabels_mod))

Chromosome vs chromosome percent identity

Comparison of the chromosomes vs between chromosomes, for chromosome 11 and 13 they are just as closely related to the same chromosome as between the chromosomes, unlike the chr05 and chr07 chromosomes rRNA loci where the same chromosomes are more closely related than between chromosomes. Could be suggestive that chr11 and chr13 are inter-combining more than chr05 and chr07

Code
allByAllComp_mod = allByAllComp %>% 
  group_by(ReadId, OtherReadId) %>% 
  mutate(read1_chrom = ifelse(grepl("_11", ReadId), "chrom11", "chrom13"))%>% 
  mutate(read2_chrom = ifelse(grepl("_11", OtherReadId), "chrom11", "chrom13")) %>% 
  mutate(comparisonCategory = paste0(sort(c(read1_chrom, read2_chrom)), collapse = "--") ) %>% 
  ungroup()

ggplot(allByAllComp_mod) + 
  geom_boxplot(aes(x = comparisonCategory, y = perId)) + 
  sofonias_theme + 
  #scale_y_continuous(limits = c(min(allByAllComp_mod$perId), 1)) + 
  scale_y_continuous(limits = c(0.990, 1))

Writing out region

Code
outBed = tibble(
  `#chrom` = c("Pf3D7_11_v3", "Pf3D7_13_v3"), 
  start = c(1918029, 2792022), 
  end = c(1933390, 2807397), 
  name = c("Pf3D7_11_v3-1918029-1918124--Pf3D7_11_v3-1933291-1933390__Pf3D7_13_v3-2792022-2792117--Pf3D7_13_v3-2807298-2807397", 
           "Pf3D7_11_v3-1918029-1918124--Pf3D7_11_v3-1933291-1933390__Pf3D7_13_v3-2792022-2792117--Pf3D7_13_v3-2807298-2807397")
)

outBed = outBed %>% 
  mutate(len = end -start) %>% 
  mutate(strand = "+")

create_dt(outBed)
Code
write_tsv(outBed, "investigatingChrom11Chrom13/shared_11_13_region.bed")

References

Otto, Thomas D, Ulrike Böhme, Mandy Sanders, Adam Reid, Ellen I Bruske, Craig W Duffy, Pete C Bull, et al. 2018. “Long Read Assemblies of Geographically Dispersed Plasmodium Falciparum Isolates Reveal Highly Structured Subtelomeres.” Wellcome Open Res 3 (May): 52.
Source Code
---
title: "Characterizing Duplicated Region"
---

```{r setup, echo=FALSE, message=FALSE}
source("../../common.R")
```



## Investigating shared region within pacbio assemblies[@Otto2018-bb]

```{bash, eval = F}
for x in /tank/data/genomes/plasmodium/genomes/pf/genomes/*.fasta; do elucidator splitFile --fasta ${x} --overWrite --trimAtWhiteSpace  ; done;
```

These strains have complete chromosome 11s and 13s 

Pf3D7  
Pf7G8  
PfCD01  
PfDd2  
PfGA01  
PfGB4  
PfGN01  
PfKE01  
PfKH02  
PfSN01  


```{r}
strains = c("Pf3D7","Pf7G8","PfCD01","PfDd2","PfGA01","PfGB4","PfGN01","PfKE01","PfKH02","PfSN01")

cat(strains, sep = "\n", file = "stableStrains.txt")

strainsDf = tibble(strain = strains)
strainsDf = strainsDf %>% 
  mutate(chrom11 = paste0(strain, "_11*.fasta")) %>% 
  mutate(chrom13 = paste0(strain, "_13*.fasta"))

mvCmd = paste0("mv ", paste0(strainsDf$chrom11, collapse = " "), " ", paste0(strainsDf$chrom13, collapse = " "), " combinedChrom11_13/")

cat(mvCmd)

```

```{bash, eval = F}
elucidator profileSharedKmerBlocks --seq Pf3D7_11_v3.fasta --dout chroms11_13_klen31 --kLen 31 --seqsDir ./ --fasta Pf3D7_11_v3.fasta --overWriteDir

cd chroms11_13_klen31

elucidator bedCoordSort --bed perfect.bed | elucidator extendBedRegions --left 5 --right 5 --bed STDIN | bedtools merge | elucidator bed3ToBed6 --bed STDIN  | elucidator filterBedRecordsByLength --bed STDIN  --minLen 65 | elucidator bedGetIntersectingGenesInGff  --gff /work/nhathawa/genomes/pfGenomes_updated/info//gff/Pf3D7.gff --extraAttributes description --overWrite --bed STDIN --out perfectlyShared_minlen65.bed

elucidator getFastaWithBed  --twoBit /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.2bit  --overWrite --bed perfectlyShared_minlen65.bed --out perfectlyShared_minlen65


elucidator extractRefSeqsFromGenomes --genomeDir /tank/data/genomes/plasmodium/genomes/pf/genomes/ --gffDir /tank/data/genomes/plasmodium/genomes/pf/info/gff/ --primaryGenome Pf3D7 --bed perfectlyShared_minlen65.bed --outputDir extractedRefSeqs --identity 95 --coverage 95 --numThreads 15


```


```{r}
perfectlyShared_minlen65_regions = readr::read_tsv("combinedChrom11_13/chroms11_13_klen31/perfectlyShared_minlen65.bed", col_names = F)
perfectlyShared_minlen65_regions = perfectlyShared_minlen65_regions %>% 
  mutate(X4 = paste0(X4, "-for"))

allRegions = tibble()
for(region in perfectlyShared_minlen65_regions$X4){
  for(strain in strains){
    fnp= paste0("combinedChrom11_13/chroms11_13_klen31/extractedRefSeqs/", region, "/beds/", strain, "_region.bed")
    if(file.exists(fnp)){
      currentRegion = readr::read_tsv(fnp, col_names = F, show_col_types = F)
      currentRegion$X4 = region
      currentRegion$strain = strain
      allRegions = bind_rows(
        allRegions, currentRegion
      )
    }
  }
}

allRegions_sum = allRegions %>% 
  group_by(strain, X1) %>% 
  summarise(start = min(X2), 
         end = max(X3)) %>% 
  filter(grepl("_1[13]", X1)) %>% 
  rename(`#chrom` = X1) %>% 
  mutate(start = start - 5000) %>% 
  mutate(end = end + 5000) %>% 
  mutate(name = paste0(`#chrom`, "-", start, "-", end), 
         len = end - start, 
         strand = "+") 

for(strainName in strains){
  allRegions_sum_byStrain = allRegions_sum %>% 
    filter(strain == strainName)
  allRegions_sum_byStrain = allRegions_sum_byStrain %>% 
    group_by() %>% 
    select(-strain)
  write_tsv(allRegions_sum_byStrain, paste0("combinedChrom11_13/chroms11_13_regions/", strainName, ".bed"))
}




allRegions_sum_exact = allRegions %>% 
  group_by(strain, X1) %>% 
  summarise(start = min(X2), 
         end = max(X3)) %>% 
  filter(grepl("_1[13]", X1)) %>% 
  rename(`#chrom` = X1) %>% 
  mutate(start = start) %>% 
  mutate(end = end) %>% 
  mutate(name = paste0(`#chrom`, "-", start, "-", end), 
         len = end - start, 
         strand = "+") 

for(strainName in strains){
  allRegions_sum_byStrain = allRegions_sum_exact %>% 
    filter(strain == strainName)
  allRegions_sum_byStrain = allRegions_sum_byStrain %>% 
    group_by() %>% 
    select(-strain)
  write_tsv(allRegions_sum_byStrain, paste0("combinedChrom11_13/chroms11_13_regions_exact/", strainName, ".bed"))
}



```


```{bash, eval = F}
cd chroms11_13_regions_exact 

for x in *.bed; do elucidator getFastaWithBed --bed $x --twoBit /tank/data/genomes/plasmodium/genomes/pf/genomes/${x%%.bed}.2bit --overWrite --out ${x%%.bed}.fasta ; done;

mkdir sequences
cd sequences

for x in ../*.fasta; do elucidator splitFile --fasta  $x ; done;

elucidator profileSharedKmerBlocks --seq Pf3D7_11_v3-1913023-1938396.fasta --dout chroms11_13_klen31 --kLen 31 --seqsDir ./ --fasta Pf3D7_11_v3-1913023-1938396.fasta --overWriteDir



```


```{bash, eval = F}
cd chroms11_13_regions

mkdir allSequences
for x in Pf*.bed; do elucidator getFastaWithBed --bed $x --twoBit /tank/data/genomes/plasmodium/genomes/pf/genomes/${x%%.bed}.2bit --overWrite --out ${x%%.bed}.fasta ; done;

cat *.fasta > allSequences/all.fasta
cat Pf*.bed > all.bed
cd allSequences

muscle -in all.fasta -out aln_all.fasta

elucidator trimToLen --fasta aln_all.fasta --length 21945 --overWrite --out trimmed_to_21945_aln_all.fasta
elucidator trimFront --fasta trimmed_to_21945_aln_all.fasta --forwardBases 21845 --overWrite --out trimmedFrom_21845_trimmed_to_21945_aln_all.fasta
elucidator sortReads --fasta trimmedFrom_21845_trimmed_to_21945_aln_all.fasta --removeGaps  --overWrite
elucidator determineRegionLastz --individual --fasta sorted_trimmedFrom_21845_trimmed_to_21945_aln_all.fasta --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.2bit   -out sorted_trimmedFrom_21845_trimmed_to_21945_aln_all.bed 


elucidator trimToLen --fasta aln_all.fasta --length 6051 --overWrite --out trimmed_to_6051_aln_all.fasta
elucidator trimFront --fasta trimmed_to_6051_aln_all.fasta --forwardBases 5951 --overWrite --out trimmedFrom_5951_trimmed_to_6051_aln_all.fasta
elucidator sortReads --fasta trimmedFrom_5951_trimmed_to_6051_aln_all.fasta --removeGaps  --overWrite
elucidator determineRegionLastz --individual --fasta sorted_trimmedFrom_5951_trimmed_to_6051_aln_all.fasta --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.2bit --overWrite --out sorted_trimmedFrom_5951_trimmed_to_6051_aln_all.bed 

elucidatorlab getMalnConservedness --fasta aln_all.fasta  --blockSize 10 --overWrite --out aln_all_conserved.tab.txt  
elucidatorlab getAlnPosToRealPosTable --fasta aln_all.fasta --overWrite --out aln_all_position_table.tab.txt  



```



## Beginning of the section 

Pf3D7_11_v3-1918029-1918124--Pf3D7_13_v3-2792022-2792117

::: column-screen-inset-shaded
![](figures/Pf3D7_11_v3-1918029-1918124--Pf3D7_13_v3-2792022-2792117.png){width=70% fig-align="center"}
:::

## Middle 
Variation in the middle, does not appear to be chromosome specific (though could have been a difference in assembly)

::: column-screen-inset-shaded
![](figures/variation_in_middle.png){width=70% fig-align="center"}
:::

## End of the section  

Pf3D7_11_v3-1933291-1933390--Pf3D7_13_v3-2807298-2807397

::: column-screen-inset-shaded
![](figures/Pf3D7_11_v3-1933291-1933390--Pf3D7_13_v3-2807298-2807397.png){width=70% fig-align="center"}
:::
The region is ~15,360 kp long, so could be hard to get through even with long reads 


## Conserved score

Plotting the percent conserved per position across the region including the before and after shared segment, all strains appear to start and end at the same position   


```{r}
conservedPosTab = readr::read_tsv("combinedChrom11_13/chroms11_13_regions/allSequences/aln_all_position_table.tab.txt") %>% 
  filter("Pf3D7_11_v3-1913023-1938396" == name) %>% 
  rename(pos = alignPosition)

conservedMulti = readr::read_tsv("combinedChrom11_13/chroms11_13_regions/allSequences/aln_all_conserved.tab.txt")

conservedMulti_max = conservedMulti %>% 
  group_by(pos) %>% 
  filter(frac == max(frac)) %>% 
  filter("----------" != block ) %>% 
  left_join(conservedPosTab) %>% 
  filter(!is.na(name)) %>% 
  filter()

blockSize = 30

conservedMulti_max_rounding = tibble(rawPos = 1:(nrow(conservedMulti_max) - blockSize),
                                     realPosition = numeric(nrow(conservedMulti_max) - blockSize), 
                                     rounded = numeric(nrow(conservedMulti_max) - blockSize))


for(row in 1:(nrow(conservedMulti_max) - blockSize)){
  round = mean(conservedMulti_max$frac[row:(row+blockSize)])
  if(round > 1){
    print(row)
    print(blockSize)
    print(conservedMulti_max$frac[row:(row+blockSize)])
    print(round)
    stop()
  }
  minPos = min(conservedMulti_max$realPosition[row:(row+blockSize)])
  conservedMulti_max_rounding$realPosition[row] = minPos
  conservedMulti_max_rounding$rounded[row] = round
}

conservedMulti_max_rounding_filt = conservedMulti_max_rounding %>% 
  group_by(realPosition) %>% 
  mutate(id = row_number()) %>% 
  filter(id == 1) %>% 
  group_by()

```

```{r}
#| fig-column: screen-inset-shaded


ggplot(conservedMulti_max_rounding_filt) + 
  geom_bar(aes(x = realPosition, y = rounded), stat = "identity")+
  sofonias_theme

```

```{bash, eval = F}
cd chroms11_13_regions_exact

cat *.fasta > all.fasta 
elucidator compareAllByAll --fasta all.fasta --verbose --numThreads 12 
elucidator createSharedSubSegmentsFromRefSeqs --fasta out.fasta --minimumKlen 12 --dout sharedSubSeqs_occurence6 --filterRegionsWithinRegions --refSeqName Pf3D7_11_v3-1918023-1933396 --overWriteDir --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.fasta --minSubRegionLen 150 --maxSubRegionLen 300 --minBlockRegionLen 30 --doNotCollapseLowFreqNodes --correctionOccurenceCutOff 6 --includeFromSubRegionSize 30

muscle -in all.fasta -out aln_all.fasta

FastTree -nt aln_all.fasta > aln_all.nwk

elucidatorlab getMalnConservedness --fasta aln_all.fasta  --blockSize 10 --overWrite --out aln_all_conserved.tab.txt  
elucidatorlab getAlnPosToRealPosTable --fasta aln_all.fasta --overWrite --out aln_all_position_table.tab.txt  

elucidator multipleAlnProteinToPcaInput --fasta aln_all.fasta --overWrite --out pcaMat_aln_all.fasta
elucidator runTRF --fasta all.fasta --overWriteDir --dout all_trfOutput --supplement --genomicLocation ../all.bed

elucidator bedFilterRegionsCompletelyInOther --bed all_trfOutput/genomic_combined.bed --intersectWithBed all_trfOutput/genomic_combined.bed | elucidator bedAddSmartIDForPlotting --bed STDIN --out all_trfOutput/filtered_genomic_combined.bed --overWrite 

elucidator countBasesPerPosition --fasta aln_all.fasta --overWrite --out aln_all_baseCounts.tab.txt

#elucidator createSharedSubSegmentsFromRefSeqs --fasta out.fasta --minimumKlen 12 --dout sharedSubSeqs_occurence6 --filterRegionsWithinRegions --refSeqName Pf3D7_11_v3-1918023-1933396 --overWriteDir --genome /tank/data/genomes/plasmodium/genomes/pf/genomes/Pf3D7.fasta --minSubRegionLen 150 --maxSubRegionLen 300 --minBlockRegionLen 30 --doNotCollapseLowFreqNodes --correctionOccurenceCutOff 6 --includeFromSubRegionSize 30

#cd sharedSubSeqs_occurence6/sharedLocs

#elucidator bedGetIntersectingGenesInGff  --gff /tank/data/genomes/plasmodium/genomes/pf/info/gff/Pf3D7.gff --extraAttributes description --overWrite --bed 0_ref_sharedLocs_genomic.bed --out 0_ref_sharedLocs_genomic_withGeneInfo.bed


```

## Comparing chromsome 11 and 13 segments 


The chromosome 11 and 13 regions appear to be extremely closely related to each other and there is no clear variation specific to chromosome 11 or 13 

```{r}
library(ggtree)
tree = ape::read.tree("combinedChrom11_13/chroms11_13_regions_exact/sequences/aln_all.nwk")


tree$tip.label = gsub("_v3", "", gsub("-.*", "", tree$tip.label))

tipDf = tibble(seq = tree$tip.label) %>% 
  mutate(chrom = ifelse(grepl("_11", seq), "chrom11", "chrom13"))
tipDf_chrom11 = tipDf %>% 
  filter(chrom == "chrom11")
tipDf_chrom13 = tipDf %>% 
  filter(chrom == "chrom13")

groups = list(chrom11 = tipDf_chrom11$seq, 
              chrom13 = tipDf_chrom13$seq)

tree = groupOTU(tree, groups)

sharedRegion_pacbioAssembliesPlot = ggtree::ggtree(tree, show.legend = FALSE) + 
  hexpand(.1) + 
  geom_tiplab(aes(color = group), show.legend = FALSE) + 
  scale_color_manual(values = c("#005AC8", "#AA0A3C")) 
```

```{r}
#| fig-column: screen-inset-shaded
#| fig-height: 5
#| fig-width: 10
print(sharedRegion_pacbioAssembliesPlot)
```

```{r}
pdf("sharedRegion_pacbioAssemblies.pdf", width = 7.5, height = 7.5/2, useDingbats = F)
print(sharedRegion_pacbioAssembliesPlot)
dev.off()
```

### Comparing Percent Identity 

All segments between strains are more >99% related to each other
```{r}
allByAllComp = readr::read_tsv("combinedChrom11_13/chroms11_13_regions_exact/sequences/out.tab.txt")

```

```{r, eval=FALSE, echo=FALSE}
#| fig-column: screen-inset-shaded

ggplot(allByAllComp) + 
  geom_tile(aes(x = OtherReadId , y = ReadId, fill = perId)) + 
  geom_text(aes(x = OtherReadId , y = ReadId, label = round(perId*100,2))) + 
  sofonias_theme_xRotate + 
  scale_y_discrete(position = "right") + 
  scale_fill_gradient2(limits = c(min(allByAllComp$perId),1), midpoint = min(allByAllComp$perId) + (max(allByAllComp$perId) - min(allByAllComp$perId))/2 )

```


```{r}
allByAllComp_select = allByAllComp %>% 
  select(OtherReadId, ReadId, perId) %>% 
  spread(ReadId, perId)

addRow =  matrix(c(colnames(allByAllComp_select)[2], rep(NA, ncol(allByAllComp_select) - 1)), nrow = 1)
colnames(addRow) = colnames(allByAllComp_select)
addRow = as_tibble(addRow) 
for(col in 2:ncol(addRow)){
  addRow[,col] = as.numeric(addRow[,col])
}


allByAllComp_select = as_tibble(addRow) %>% bind_rows(allByAllComp_select)


allByAllComp_select_mat = as.matrix(allByAllComp_select[,2:ncol(allByAllComp_select)])
rownames(allByAllComp_select_mat) = allByAllComp_select$OtherReadId

allByAllComp_select_mat = as.matrix(as.dist(allByAllComp_select_mat))
allByAllComp_select_mat[0 == allByAllComp_select_mat] = NA

```

```{r}
library(circlize)
meta = readr::read_tsv("../../meta/metadata/meta.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))
metaByBioSample = readr::read_tsv("../../meta/metadata/metaByBioSample.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))



allByAllComp_select_mat_noLabels = allByAllComp_select_mat
# rownames(allByAllComp_select_mat_noLabels) = NULL 
# colnames(allByAllComp_select_mat_noLabels) = NULL


rownames(allByAllComp_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", rownames(allByAllComp_select_mat_noLabels)))))
colnames(allByAllComp_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", colnames(allByAllComp_select_mat_noLabels)))))


col_fun = colorRamp2(c(
  min(allByAllComp_select_mat_noLabels, na.rm = T),
  min(allByAllComp_select_mat_noLabels, na.rm = T) + (
    max(allByAllComp_select_mat_noLabels, na.rm = T) - min(allByAllComp_select_mat_noLabels, na.rm = T)
  ) / 2,
  max(allByAllComp_select_mat_noLabels, na.rm = T)
),
c("#4575b4", "#ffffbf", "#d73027"))
#c("#ffeda0", "#feb24c", "#f03b20"))



topAnnoDat = tibble(name = colnames(allByAllComp_select_mat)) %>% 
  mutate(sampchrom = gsub("_v3", "", gsub("-.*", "", name))) %>% 
  mutate(sample = gsub("Dd2", "DD2", gsub("_.*", "", gsub("Pf", "", sampchrom)))) %>% 
  mutate(chrom = gsub(".*_", "", sampchrom)) %>% 
  left_join(metaByBioSample %>% 
              select(sample, secondaryRegion))

topAnno_df = topAnnoDat[,c("chrom", "secondaryRegion")] %>% 
  rename(continent = secondaryRegion) %>% 
  mutate(continent = gsub("Netherlands", "AFRICA", continent) ) %>% 
  as.data.frame()

topAnnoColors = createColorListFromDf(topAnno_df)

topAnnoColors[["continent"]] = c("AFRICA"  =   "#E69F00", 
                                 "ASIA" =      "#F0E442", 
                                 "OCEANIA"  =  "#F748A5", 
                                 "S_AMERICA" = "#0072B2", 
                                 "Netherlands"="black")

topAnnoColors[["chrom"]] = c("11" = "#008DF9", "13" = "#A40122")



topAnno = HeatmapAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # the annotation direction doesn't appear to work right now 
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

sideAnno = rowAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # ,
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

colnames(allByAllComp_select_mat_noLabels) = gsub("3D7", "**3D7", colnames(allByAllComp_select_mat_noLabels))
rownames(allByAllComp_select_mat_noLabels) = gsub("3D7", "**3D7", rownames(allByAllComp_select_mat_noLabels))

hc = hclust(dist(allByAllComp_select_mat_noLabels))
allByAllComp_select_mat_heatmap = Heatmap(
  allByAllComp_select_mat_noLabels,
  rect_gp = gpar(type = "none"),
  column_dend_side = "bottom",
  cell_fun = function(j, i, x, y, w, h, fill) {
    if (as.numeric(x) <= 1 - as.numeric(y) + 1e-6) {
      grid.rect(x, y, w, h, gp = gpar(fill = fill, col = fill))
    }
  },
  # cluster_rows = hc,
  # cluster_columns = hc,
  row_names_side = "left",
  name = "%Identity",
  col = col_fun,
  bottom_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"), 
  show_column_dend = F, 
  show_heatmap_legend = F
  
  # , 
  # heatmap_legend_param = list(
  #   direction = "horizontal"
  # )
)

chrom_lgd = Legend(labels = sort(unique(topAnno_df$chrom)), title = "chrom", legend_gp = gpar(fill = topAnnoColors[["chrom"]][sort(unique(topAnno_df$chrom))]))
continent_lgd = Legend(labels = sort(unique(topAnno_df$continent)), title = "continent", legend_gp = gpar(fill = topAnnoColors[["continent"]][sort(unique(topAnno_df$continent))]))
heatmap_lgd = Legend(col_fun = col_fun, title = "%Identity")


```


```{r}
#| fig-column: screen-inset-shaded
draw(allByAllComp_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.775, "npc"), y = unit(0.85, "npc"))

```


```{r}
pdf("sharedRegion_pacbioAssemblies_percentIdentity.pdf", width = 7.5, height = 5, useDingbats = F)
draw(allByAllComp_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.775, "npc"), y = unit(0.85, "npc"))
dev.off()
```

### Comparing Number of variants 

```{r}
allByAllCompVarCount_select = allByAllComp %>% 
  select(OtherReadId, ReadId, totalDiffs) %>% 
  spread(ReadId, totalDiffs)

addRow =  matrix(c(colnames(allByAllCompVarCount_select)[2], rep(NA, ncol(allByAllCompVarCount_select) - 1)), nrow = 1)
colnames(addRow) = colnames(allByAllCompVarCount_select)
addRow = as_tibble(addRow) 
for(col in 2:ncol(addRow)){
  addRow[,col] = as.numeric(addRow[,col])
}


allByAllCompVarCount_select = as_tibble(addRow) %>% bind_rows(allByAllCompVarCount_select)


allByAllCompVarCount_select_mat = as.matrix(allByAllCompVarCount_select[,2:ncol(allByAllCompVarCount_select)])
rownames(allByAllCompVarCount_select_mat) = allByAllCompVarCount_select$OtherReadId

allByAllCompVarCount_select_mat = as.matrix(as.dist(allByAllCompVarCount_select_mat))
allByAllCompVarCount_select_mat[0 == allByAllCompVarCount_select_mat] = NA

```

```{r}
library(circlize)
meta = readr::read_tsv("../../meta/metadata//meta.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))
metaByBioSample = readr::read_tsv("../../meta/metadata/metaByBioSample.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))



allByAllCompVarCount_select_mat_noLabels = allByAllCompVarCount_select_mat
# rownames(allByAllCompVarCount_select_mat_noLabels) = NULL 
# colnames(allByAllCompVarCount_select_mat_noLabels) = NULL


rownames(allByAllCompVarCount_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", rownames(allByAllCompVarCount_select_mat_noLabels)))))
colnames(allByAllCompVarCount_select_mat_noLabels) = gsub("Dd2", "DD2", gsub("Pf", "", gsub("_v3", "", gsub("-.*", "", colnames(allByAllCompVarCount_select_mat_noLabels)))))


col_fun = colorRamp2(c(
  min(allByAllCompVarCount_select_mat_noLabels, na.rm = T),
  min(allByAllCompVarCount_select_mat_noLabels, na.rm = T) + (
    max(allByAllCompVarCount_select_mat_noLabels, na.rm = T) - min(allByAllCompVarCount_select_mat_noLabels, na.rm = T)
  ) / 2,
  max(allByAllCompVarCount_select_mat_noLabels, na.rm = T)
),
c("#006837", "#ffffbf", "#d73027"))
#c("#4575b4", "#ffffbf", "#d73027"))
#c("#ffeda0", "#feb24c", "#f03b20"))



topAnnoDat = tibble(name = colnames(allByAllCompVarCount_select_mat)) %>% 
  mutate(sampchrom = gsub("_v3", "", gsub("-.*", "", name))) %>% 
  mutate(sample = gsub("Dd2", "DD2", gsub("_.*", "", gsub("Pf", "", sampchrom)))) %>% 
  mutate(chrom = gsub(".*_", "", sampchrom)) %>% 
  left_join(metaByBioSample %>% 
              select(sample, secondaryRegion))

topAnno_df = topAnnoDat[,c("chrom", "secondaryRegion")] %>% 
  rename(continent = secondaryRegion) %>% 
  mutate(continent = gsub("Netherlands", "AFRICA", continent) ) %>% 
  as.data.frame()

topAnnoColors = createColorListFromDf(topAnno_df)

topAnnoColors[["continent"]] = c("AFRICA"  =   "#E69F00", 
                                 "ASIA" =      "#F0E442", 
                                 "OCEANIA"  =  "#F748A5", 
                                 "S_AMERICA" = "#0072B2", 
                                 "Netherlands"="black")

topAnnoColors[["chrom"]] = c("11" = "#008DF9", "13" = "#A40122")



topAnno = HeatmapAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # the annotation direction doesn't appear to work right now 
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

sideAnno = rowAnnotation(
  df = topAnno_df,
  col = topAnnoColors,
  show_legend = F,
  gp = gpar(col = "black")
  # ,
  # annotation_legend_param = list(
  #   chrom = list(legend_direction = "horizontal", direction = "horizontal"), 
  #   continent = list(legend_direction = "horizontal", direction = "horizontal")
  # )
)

colnames(allByAllCompVarCount_select_mat_noLabels) = gsub("3D7", "**3D7", colnames(allByAllCompVarCount_select_mat_noLabels))
rownames(allByAllCompVarCount_select_mat_noLabels) = gsub("3D7", "**3D7", rownames(allByAllCompVarCount_select_mat_noLabels))

hc = hclust(dist(allByAllCompVarCount_select_mat_noLabels))
allByAllCompVarCount_select_mat_heatmap = Heatmap(
  allByAllCompVarCount_select_mat_noLabels,
  rect_gp = gpar(type = "none"),
  column_dend_side = "bottom",
  cell_fun = function(j, i, x, y, w, h, fill) {
    if (as.numeric(x) <= 1 - as.numeric(y) + 1e-6) {
      grid.rect(x, y, w, h, gp = gpar(fill = fill, col = fill))
    }
  },
  # cluster_rows = hc,
  # cluster_columns = hc,
  row_names_side = "left",
  name = "Number of \nVariants",
  col = col_fun,
  bottom_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"), 
  show_column_dend = F, 
  show_heatmap_legend = F,
  na_col = "grey80",
  # , 
  # heatmap_legend_param = list(
  #   direction = "horizontal"
  # )
)

chrom_lgd = Legend(labels = sort(unique(topAnno_df$chrom)), title = "chrom", legend_gp = gpar(fill = topAnnoColors[["chrom"]][sort(unique(topAnno_df$chrom))]))
continent_lgd = Legend(labels = sort(unique(topAnno_df$continent)), title = "continent", legend_gp = gpar(fill = topAnnoColors[["continent"]][sort(unique(topAnno_df$continent))]))
heatmap_lgd = Legend(col_fun = col_fun, title = "Number of \nVariants")


```


```{r}
#| fig-column: screen-inset-shaded
draw(allByAllCompVarCount_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.80, "npc"), y = unit(0.85, "npc"))

```


```{r}
pdf("sharedRegion_pacbioAssemblies_varCount.pdf", width = 7.5, height = 5, useDingbats = F)
draw(allByAllCompVarCount_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.80, "npc"), y = unit(0.85, "npc"))
dev.off()
```





```{r, eval = F, echo=F}
library(gridExtra)
conservedInfo_between_11_and_13_sharedRegion = readr::read_tsv("../../MappingOutSurroundingRegions/conservedInfo_between_11_and_13_sharedRegion.tsv")
conservedInfo_current_plot = ggplot(conservedInfo_between_11_and_13_sharedRegion) +
    geom_segment(
      aes(
        x = chrom1Pos_start,
        xend = crhom1Pos_end - 1,
        y = chrom2Pos,
        yend = chrom2Pos + size - 1
        
      ), 
      linewidth = 2.5,
      color = "black"
    )  + 
    geom_rect(
      ymin = conservedInfo_between_11_and_13_sharedRegion$chrom2_section_start[1],
      ymax = conservedInfo_between_11_and_13_sharedRegion$chrom2_section_start[1] + conservedInfo_between_11_and_13_sharedRegion$chrom2_section_len[1], 
      xmin = conservedInfo_between_11_and_13_sharedRegion$chrom1_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom1_section_len[1] * 0.1,
      xmax = conservedInfo_between_11_and_13_sharedRegion$chrom1_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom1_section_len[1] * 0.001, 
      fill = "grey81") + 
    geom_rect(
      ymax = conservedInfo_between_11_and_13_sharedRegion$chrom2_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom2_section_len[1] * 0.001, 
      ymin = conservedInfo_between_11_and_13_sharedRegion$chrom2_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom2_section_len[1] * 0.1, 
      xmin = conservedInfo_between_11_and_13_sharedRegion$chrom1_section_start[1],
      xmax = conservedInfo_between_11_and_13_sharedRegion$chrom1_section_start[1] + conservedInfo_between_11_and_13_sharedRegion$chrom1_section_len[1], 
      fill = "grey81")  +
    geom_rect(data = pf3d7Genes_chrom1, 
              aes(xmin = col.1, xmax = col.2, 
                  ymax = conservedInfo_between_11_and_13_sharedRegion$chrom2_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom2_section_len[1] * 0.001, 
                  ymin = conservedInfo_between_11_and_13_sharedRegion$chrom2_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom2_section_len[1] * 0.1, 
                  fill = description)) + 
    geom_rect(data = pf3d7Genes_chrom2, 
              aes(ymin = col.1, ymax = col.2, 
                  xmax = conservedInfo_between_11_and_13_sharedRegion$chrom1_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom1_section_len[1] * 0.001, 
                  xmin = conservedInfo_between_11_and_13_sharedRegion$chrom1_section_start[1] - conservedInfo_between_11_and_13_sharedRegion$chrom1_section_len[1] * 0.1, 
                  fill = description)) + 
    labs(title = "", 
         x = "3D7 Chr13-2792021-2807295 (bp=15,274)", 
         y = "3D7 Chr11-1918028-1933288 (bp=15,260)", 
         fill = "") + 
  sofonias_theme + 
  coord_equal() + 
    # scale_fill_tableau() + 
  scale_fill_manual(values = c("#2271B2", "#F748A5", "#3DB7E9", "#D55E00")) + 
  guides(fill=guide_legend(ncol = 1,byrow=TRUE)) + 
  theme(legend.position = c(0.3, 0.7), 
        panel.border = element_blank(), 
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"))



grobHeatmap = grid.grabExpr({
  draw(allByAllComp_select_mat_heatmap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "left")
draw(packLegend(heatmap_lgd,chrom_lgd,continent_lgd,  direction = "horizontal"), x = unit(0.775, "npc"), y = unit(0.85, "npc"))
}) 

grobGgplot = grid.grabExpr({
  print(conservedInfo_current_plot)
})

gl = list(grobGgplot, grobHeatmap)

lay <- rbind(c(1,1,1,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA), 
             c(1,1,1,1,NA,NA,2,2,2,2,2,2,2,2),
             c(1,1,1,1,NA,NA,2,2,2,2,2,2,2,2),
             c(1,1,1,1,NA,NA,2,2,2,2,2,2,2,2),
             c(1,1,1,1,NA,NA,2,2,2,2,2,2,2,2), 
             c(1,1,1,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA))

pdf("test.pdf", width = 10, height = 10, useDingbats = F)
grid.arrange(
  grobs = gl,
  layout_matrix = lay
)
dev.off()

```



```{r}
#| fig-column: screen-inset-shaded
allByAllComp_select_mat_noLabels_mod = allByAllComp_select_mat_noLabels
allByAllComp_select_mat_noLabels_mod = allByAllComp_select_mat_noLabels_mod - min(allByAllComp_select_mat_noLabels_mod, na.rm = T)
allByAllComp_select_mat_noLabels_mod[is.na(allByAllComp_select_mat_noLabels_mod)] = 0
Heatmap3D(allByAllComp_select_mat_noLabels_mod - min(allByAllComp_select_mat_noLabels_mod))
```

## Chromosome vs chromosome percent identity  

Comparison of the chromosomes vs between chromosomes, for chromosome 11 and 13 they are just as closely related to the same chromosome as between the chromosomes, unlike the [chr05 and chr07 chromosomes rRNA loci](../sharedBetween05_and_07/characterizingSharingInPacbio.qmd#chromosome-vs-chromosome-percent-identity) where the same chromosomes are more closely related than between chromosomes. Could be suggestive that chr11 and chr13 are inter-combining more than chr05 and chr07 

```{r}
allByAllComp_mod = allByAllComp %>% 
  group_by(ReadId, OtherReadId) %>% 
  mutate(read1_chrom = ifelse(grepl("_11", ReadId), "chrom11", "chrom13"))%>% 
  mutate(read2_chrom = ifelse(grepl("_11", OtherReadId), "chrom11", "chrom13")) %>% 
  mutate(comparisonCategory = paste0(sort(c(read1_chrom, read2_chrom)), collapse = "--") ) %>% 
  ungroup()

ggplot(allByAllComp_mod) + 
  geom_boxplot(aes(x = comparisonCategory, y = perId)) + 
  sofonias_theme + 
  #scale_y_continuous(limits = c(min(allByAllComp_mod$perId), 1)) + 
  scale_y_continuous(limits = c(0.990, 1))
```

# Writing out region 
```{r}
outBed = tibble(
  `#chrom` = c("Pf3D7_11_v3", "Pf3D7_13_v3"), 
  start = c(1918029, 2792022), 
  end = c(1933390, 2807397), 
  name = c("Pf3D7_11_v3-1918029-1918124--Pf3D7_11_v3-1933291-1933390__Pf3D7_13_v3-2792022-2792117--Pf3D7_13_v3-2807298-2807397", 
           "Pf3D7_11_v3-1918029-1918124--Pf3D7_11_v3-1933291-1933390__Pf3D7_13_v3-2792022-2792117--Pf3D7_13_v3-2807298-2807397")
)

outBed = outBed %>% 
  mutate(len = end -start) %>% 
  mutate(strand = "+")

create_dt(outBed)

write_tsv(outBed, "investigatingChrom11Chrom13/shared_11_13_region.bed")
```



<!-- ## Mapping out shared region  -->


<!-- ```{r} -->

<!-- conservedPosTab = readr::read_tsv("combinedChrom11_13/chroms11_13_regions_exact/sequences/aln_all_position_table.tab.txt") %>%  -->
<!--   separate(name, into = c("chrom", "start", "end"), convert = T, remove = F, sep = "-") %>%  -->
<!--   mutate(realPosition = realPosition + start) -->

<!-- aln_all_baseCounts = readr::read_tsv("combinedChrom11_13/chroms11_13_regions_exact/sequences/aln_all_baseCounts.tab.txt") -->

<!-- aln_all_baseCounts_filt = aln_all_baseCounts %>%  -->
<!--   filter(count > 0, fraction <1) %>%  -->
<!--   group_by(position) %>%  -->
<!--   filter(max(count) < 19) %>%  -->
<!--   filter(!any(base == '-')) %>%  -->
<!--   select(-meanReadLengthRounded) -->

<!-- conservedPosTab_Pf3D7_11_v3 = conservedPosTab %>%  -->
<!--   filter(grepl("Pf3D7_11_v3", name)) %>%  -->
<!--   rename(position = alignPosition) %>%  -->
<!--   left_join(aln_all_baseCounts_filt)%>%  -->
<!--   filter(!is.na(base)) -->

<!-- conservedPosTab_Pf3D7_13_v3 = conservedPosTab %>%  -->
<!--   filter(grepl("Pf3D7_13_v3", name))%>%  -->
<!--   rename(position = alignPosition) %>%  -->
<!--   left_join(aln_all_baseCounts_filt) %>%  -->
<!--   filter(!is.na(base)) -->


<!-- variableSitesInPacbioGenomes = bind_rows( -->
<!--   conservedPosTab_Pf3D7_11_v3,  -->
<!--   conservedPosTab_Pf3D7_13_v3 -->
<!-- ) %>%  -->
<!--   mutate(`#chrom` = chrom) -->



<!-- chroms = readr::read_tsv("../../MappingOutSurroundingRegions/surroundingRegionsMaterials/chromLengths/Pf3D7.txt", col_names = c("chrom", "length")) -->

<!-- chrom11_genes = readr::read_tsv("../../MappingOutSurroundingRegions/surroundingRegionsMaterials//endBeds/split_Pf3D7_chrom11_toEnd_genes.tab.txt", col_names = T) -->
<!-- chrom13_genes = readr::read_tsv("../../MappingOutSurroundingRegions/surroundingRegionsMaterials//endBeds/split_Pf3D7_chrom13_toEnd_genes.tab.txt", col_names = T) -->
<!-- allGenes = bind_rows(chrom11_genes, chrom13_genes) %>%  -->
<!--   rename(chrom = col.0, start = col.1, end = col.2, name = col.3, length = col.4, strand = col.5) %>%  -->
<!--   mutate(rawGeneDescription = description) %>%  -->
<!--   mutate(gene = ifelse(grepl("PF3D7_0831800", description), "HRP II", "other")) %>%  -->
<!--   mutate(gene = ifelse(grepl("PF3D7_1372200", description), "HRP III", gene))  %>%  -->
<!--   mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%  -->
<!--   mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%  -->
<!--   mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%  -->
<!--   mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%  -->
<!--   mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%  -->
<!--   mutate(description = gsub(",putative", ", putative", description)) %>%  -->
<!--   mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%  -->
<!--   mutate(description = gsub("unknownfunction", "unknown function", description))%>%  -->
<!--   mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%  -->
<!--   mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%  -->
<!--   mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%  -->
<!--   mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%  -->
<!--   mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%  -->
<!--   mutate(description = gsub("unknownfunction", "unknown function", description))%>%  -->
<!--   mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%  -->
<!--   mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%  -->
<!--   mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%  -->
<!--   mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%  -->
<!--   mutate(description = gsub("surfaceantigen", "surface antigen", description))  %>%  -->
<!--   mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%  -->
<!--   mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%  -->
<!--   mutate(description = gsub("PIR protein", "stevor", description)) %>%  -->
<!--   mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%  -->
<!--   mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%  -->
<!--   mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%  -->
<!--   mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%  -->
<!--   mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%  -->
<!--   mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%  -->
<!--   mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description))  %>%  -->
<!--   mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description))%>%  -->
<!--   mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description))%>%  -->
<!--   mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%  -->
<!--   mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description)) %>%  -->
<!--   mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) -->


<!-- descriptionColorsNames = unique(c(allGenes$description)) -->
<!-- descriptionColors = scheme$hex(length(descriptionColorsNames)) -->
<!-- names(descriptionColors) = descriptionColorsNames -->


<!-- renamed_combined_sharedRegionsNotInTandems = readr::read_tsv("../../sharedBetween11_and_13/windows/renamed_combined_sharedRegionsNotInTandems.bed", col_names = F) -->
<!-- combined_11_13_sharedRegions_stepped = readr::read_tsv("../../sharedBetween11_and_13/windows/combined_11_13_sharedRegions_stepped.bed", col_names = F); -->
<!-- inbetweenConservedRegions = readr::read_tsv("../../sharedBetween11_and_13/windows/inbetweenConservedRegions.bed", col_names = F); -->
<!-- sharedRegionsNotInTandems_size150_step25 = readr::read_tsv("../../sharedBetween11_and_13/windows/sharedRegionsNotInTandems_size150_step25.bed", col_names = F); -->
<!-- bedsOfSharedRegions = readr::read_tsv("../../sharedBetween11_and_13/bedsOfSharedRegions/Pf3D7.bed", col_names = T); -->
<!-- inbetweenLargeTandems = readr::read_tsv("../../windows/inbetweenLargeTandems.bed", col_names = F); -->




<!-- finalWindows_inSharedRegion = readr::read_tsv("../windowAnalysis/windows/finalHRPII_HRPIII_windows.bed", col_names = F) %>%  -->
<!--   left_join(outBed %>%  -->
<!--               select(1:3) %>%  -->
<!--               rename(X1 = `#chrom`,  -->
<!--                      sharedStart = start,  -->
<!--                      sharedEnd = end)) %>%  -->
<!--   filter((X2 >= sharedStart & X2 < sharedEnd) | (X3 >= sharedStart & X3 <= sharedEnd)) -->

<!-- finalSubWindows_inSharedRegion = readr::read_tsv("../windowAnalysis/windows/finalHRPII_HRPIII_windows_combinedVarConservedRegions.bed", col_names = F) %>%  -->
<!--   left_join(outBed %>%  -->
<!--               select(1:3) %>%  -->
<!--               rename(X1 = `#chrom`,  -->
<!--                      sharedStart = start,  -->
<!--                      sharedEnd = end)) %>%  -->
<!--   filter((X2 >= sharedStart & X2 < sharedEnd) | (X3 >= sharedStart & X3 <= sharedEnd)) -->


<!-- allGenes_inSharedRegion = allGenes %>%  -->
<!--   left_join(outBed %>%  -->
<!--               select(1:3) %>%  -->
<!--               rename(chrom = `#chrom`,  -->
<!--                      sharedStart = start,  -->
<!--                      sharedEnd = end)) %>%  -->
<!--   filter((start >= sharedStart & start < sharedEnd) | (end >= sharedStart & end <= sharedEnd)) %>%  -->
<!--   group_by(chrom, start, end, name, length) %>%  -->
<!--   mutate(start = max(start, sharedStart),  -->
<!--          end = min(end, sharedEnd)) -->



<!-- genomic_combined_filtered = readr::read_tsv("chroms11_13_regions_exact/sequences/all_trfOutput/filtered_genomic_combined.bed", col_names = F) -->

<!-- genomic_combined_filtered_3d7 = genomic_combined_filtered %>%  -->
<!--   filter(X1 %in% c("Pf3D7_11_v3", "Pf3D7_13_v3")) -->
<!-- colnames(genomic_combined_filtered_3d7)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(finalWindows_inSharedRegion)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(finalSubWindows_inSharedRegion)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(allGenes_inSharedRegion)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(renamed_combined_sharedRegionsNotInTandems)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(combined_11_13_sharedRegions_stepped)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(inbetweenConservedRegions)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(sharedRegionsNotInTandems_size150_step25)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(bedsOfSharedRegions)[1:length(colnames(outBed))] = colnames(outBed) -->
<!-- colnames(inbetweenLargeTandems)[1:length(colnames(outBed))] = colnames(outBed) -->



<!-- inbetweenLargeTandems_inSharedRegion = inbetweenLargeTandems%>%  -->
<!--   left_join(outBed %>%  -->
<!--               select(1:3) %>%  -->
<!--               rename( -->
<!--                      sharedStart = start,  -->
<!--                      sharedEnd = end)) %>%  -->
<!--   filter((start >= sharedStart & start < sharedEnd) | (end >= sharedStart & end <= sharedEnd)) %>%  -->
<!--   group_by(`#chrom`, start, end, name, len) %>%  -->
<!--   mutate(start = max(start, sharedStart),  -->
<!--          end = min(end, sharedEnd)) %>%  -->
<!--   ungroup() -->


<!-- # temp = readr::read_tsv("tunning/shared_11_13_region_trfOutput/onlymapped_nochr11Locs_windowsAcrossDuplicatedRegion_chr11_step20_size200_id90.bed", col_names = F) %>%  -->
<!-- #   group_by(X4) %>%  -->
<!-- #   count() %>%  -->
<!-- #   ungroup() %>%  -->
<!-- #   separate(X4, remove = F, convert = T, sep = "-", into = c("chrom", "start", "end", "strand")) %>%  -->
<!-- #   mutate("#chrom" = chrom) -->

<!-- # temp = readr::read_tsv("outFilt_id80.tab.txt", col_names = T)  %>% -->
<!-- #   mutate("#chrom" = chrom) -->
<!-- #  -->
<!-- # temp2 = readr::read_tsv("outFilt_id90.tab.txt", col_names = T)  %>% -->
<!-- #   mutate("#chrom" = chrom) -->

<!-- temp = readr::read_tsv("filtered_id80_chr11_chr13_inDuplicatedArea_withGeneInfo.bed", col_names = F) -->
<!-- colnames(temp)[1:length(colnames(outBed))] = colnames(outBed) -->

<!-- regions_withSD01MultiInShared = readr::read_tsv("../windowAnalysis/regions_withSD01MultiInShared.bed") -->
<!-- new_regions_withSD01MultiInShared = readr::read_tsv("/Users/nick/projects/plasmodium/falciparum/PathWeaverAnalys_2020_12/chr11_chr13_inDuplicatedArea/PfSD01_chr11_chr13_inDuplicatedArea/final/hold.bed", col_names = F) -->
<!-- colnames(new_regions_withSD01MultiInShared)[1:length(colnames(outBed))] = colnames(outBed) -->

<!-- firstPassSuccess = readr::read_tsv("chrom11_13DuplicatedRegion_successfulFirstPass.bed", col_names = F) -->
<!-- colnames(firstPassSuccess)[1:length(colnames(outBed))] = colnames(outBed) -->


<!-- varWindowFromFirstPassSuccess = readr::read_tsv("finalTunned_duplicatedRegionChr11Chr13_combinedVarConservedRegions.bed", col_names = F) -->
<!-- colnames(varWindowFromFirstPassSuccess)[1:length(colnames(outBed))] = colnames(outBed) -->



<!-- ``` -->

<!-- ```{r} -->
<!-- #| fig-column: screen-inset-shaded -->

<!-- overlayPlot = ggplot() + -->
<!--   geom_vline(aes(xintercept = realPosition), -->
<!--              data = variableSitesInPacbioGenomes,  -->
<!--              color = "red") +  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 0.5, ymax = 1),  -->
<!--             data = outBed) +  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 1, ymax = 2),  -->
<!--             data = genomic_combined_filtered_3d7) +  -->

<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 2, ymax = 3), fill = "maroon",  -->
<!--             data = inbetweenLargeTandems_inSharedRegion) +  -->
<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 3, ymax = 4), fill = "darkblue",  -->
<!--             data = temp ) +  -->
<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 3.75, ymax = 4), fill = "#00FCCF",  -->
<!--             data = firstPassSuccess )  +  -->



<!--     # geom_rect(aes(xmin = start, xmax = end,  -->
<!--     #             ymin = 4, ymax = 5), fill = "darkblue",  -->
<!--     #         data = temp2 ) +  -->
<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 3, ymax = 3.5), fill = "limegreen",  -->
<!--             data = regions_withSD01MultiInShared ) +  -->
<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 3, ymax = 3.25), fill = "forestgreen",  -->
<!--             data = new_regions_withSD01MultiInShared ) +  -->


<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -1, ymax = -0.5),  -->
<!--             data = finalWindows_inSharedRegion) +  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -0.5, ymax = 0), fill = "orange",  -->
<!--             data = finalSubWindows_inSharedRegion) +  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 4, ymax = 5), fill = "orange",  -->
<!--             data = finalSubWindows_inSharedRegion) +  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 0, ymax = 1,  -->
<!--                 fill = description), -->
<!--             data = allGenes_inSharedRegion) +  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -0.75, ymax = -0.25), fill = "blue",  -->
<!--             data = combined_11_13_sharedRegions_stepped)+  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -0.75, ymax = -0.5), fill = "red",  -->
<!--             data = renamed_combined_sharedRegionsNotInTandems)+  -->
<!--   geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -1, ymax = -2), fill = "purple",  -->
<!--             data = inbetweenConservedRegions) +  -->

<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -2.5, ymax = -2), fill = "lightblue",  -->
<!--             data = bedsOfSharedRegions)+  -->

<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = -3, ymax = -2.5), fill = "magenta",  -->
<!--             data = sharedRegionsNotInTandems_size150_step25) +  -->
<!--     geom_rect(aes(xmin = start, xmax = end,  -->
<!--                 ymin = 4, ymax = 4.25), fill = "#000000",  -->
<!--             data = varWindowFromFirstPassSuccess )+  -->



<!--   scale_fill_manual(values = descriptionColors) +  -->
<!--   facet_wrap(~`#chrom`, scales = "free", ncol = 1) +  -->
<!--   sofonias_theme -->
<!-- print(overlayPlot) -->


<!-- ``` -->

<!-- ```{r} -->
<!-- #| column: screen-inset-shaded -->


<!-- ggplotly(overlayPlot) -->
<!-- ``` -->