Plotting haplotype variation within regions

Reading in data

Code

load("rowAnnoColors.Rdata")

Downloads

meta.tab.txt metaByBioSample real_mccoil_COI_calls.tsv slim_allSelectedClustersInfo.tab.txt.gz metaSelected.tab.txt allMeta_HRP2_HRP3_deletionCalls.tab.txt subwindows_regionMeta.tab.txt

Code

# Sample metadata table. Any occurrence of the text "South East Asia - East"
# in the country column is replaced with "Cambodia".
meta <- readr::read_tsv("../meta/metadata/meta.tab.txt") %>%
  mutate(
    country = gsub(
      pattern = "South East Asia - East",
      replacement = "Cambodia",
      x = country
    )
  )

# The metaByBioSample table receives the same country relabelling.
metaByBioSample <- readr::read_tsv("../meta/metadata/metaByBioSample.tab.txt") %>%
  mutate(
    country = gsub(
      pattern = "South East Asia - East",
      replacement = "Cambodia",
      x = country
    )
  )

# Earlier COI input, kept for reference:
# coiCalls = readr::read_tsv("/Users/nick/Dropbox (Personal)/ownCloud/documents/plasmodium/falciparum/pfepipanels/Pf_Epi_Panels/data/MAD4HATTER/data/pf/COI_calls.tab.txt")
# coiCalls_poly = coiCalls %>%
#   filter(COI > 1)

# COI calls currently in use (alternative input: "PfSMART_COI_calls.tab.txt").
coiCalls <- readr::read_tsv("heome1_COI_calls.tab.txt")

# Rows whose COI call is greater than 1.
coiCalls_poly <- filter(coiCalls, COI > 1)

# COI calls read from the THEREALMcCOIL categorical_method output.
realmccoilCoiCalls <- readr::read_tsv(
  "wgs_variants/THEREALMcCOIL/categorical_method/real_mccoil_COI_calls.tsv"
)

# Rows where at least one of the two median columns differs from 1.
realmccoilCoiCalls_poly <- filter(
  realmccoilCoiCalls,
  random_median != 1 | topHE_median != 1
)

# Previously made HRP2/HRP3 deletion calls, with the same country relabelling
# that is applied to the metadata tables.
previousDeletionCalls <- readr::read_tsv("allMeta_HRP2_HRP3_deletionCalls.tab.txt") %>%
  # Filters that are currently disabled:
  #   filter(country %!in% c("Bangladesh", "Mauritania", "Myanmar", "The Gambia")) %>%
  #   filter(((grepl("SPT", sample) & possiblyChr11Deleted))) %>%
  #   filter(BiologicalSample %!in% coiCalls_poly$sample) %>%
  mutate(
    country = gsub(
      pattern = "South East Asia - East",
      replacement = "Cambodia",
      x = country
    )
  )

# Attach the deletion calls to the metadata, derive a combined hrp2/hrp3 label,
# and add the topHE_median estimate as COI. Both joins use the columns the
# tables have in common (no explicit `by`).
meta <- meta %>%
  left_join(previousDeletionCalls) %>%
  mutate(
    hrpCall = case_when(
      possiblyHRP2Deleted & possiblyHRP3Deleted ~ "pfhrp2-/pfhrp3-",
      possiblyHRP2Deleted & !possiblyHRP3Deleted ~ "pfhrp2-/pfhrp3+",
      !possiblyHRP2Deleted & possiblyHRP3Deleted ~ "pfhrp2+/pfhrp3-",
      # Everything not matched above, including rows with NA flags.
      TRUE ~ "pfhrp2+/pfhrp3+"
    )
  ) %>%
  left_join(
    realmccoilCoiCalls %>%
      select(BiologicalSample, COI = topHE_median)
  )
  # Disabled alternative COI source:
  # left_join(coiCalls %>%
  #             rename(BiologicalSample = sample))

# BED file for the chr13/chr11 shared region (no header row in the file).
homologousRegion <- readr::read_tsv(
  "../rRNA_segmental_duplications/sharedBetween11_and_13/investigatingChrom11Chrom13/Pf3D7_13_v3-2792021-2807295-for--Pf3D7_11_v3-1918028-1933288-for.bed",
  col_names = FALSE
)

# Per-subwindow region metadata.
regions <- readr::read_tsv("subwindows_regionMeta.tab.txt")

# Selected samples, limited to those whose topHE_median COI equals 1.
metaSelected <- readr::read_tsv("metaSelected.tab.txt") %>%
  # select(-COI) %>%
  left_join(
    realmccoilCoiCalls %>%
      select(BiologicalSample, COI = topHE_median)
  ) %>%
  filter(COI == 1)

# Subsets by deletion flag.
metaSelected_hrp2_deleted <- filter(metaSelected, possiblyHRP2Deleted)
metaSelected_hrp3_deleted <- filter(metaSelected, possiblyHRP3Deleted)
metaSelected_hrp2_and_hrp3_deleted <- filter(
  metaSelected,
  possiblyHRP2Deleted,
  possiblyHRP3Deleted
)

# Lookup from region name to genomic ID.
regions_key <- select(regions, name, genomicID)

Code

# Final HRP2/HRP3 sub-window BED file (no header row in the file).
finalHrpSubwindows <- readr::read_tsv(
  "../windowAnalysis/windows/finalHRPII_HRPIII_windows_withTuned_combinedVarConservedRegions.bed",
  col_names = FALSE
)

# Genomic IDs that are excluded from the clustering tables further down.
erroneousRegions <- c(
  "Pf3D7_11_v3-1944071-1944237",
  "Pf3D7_11_v3-1944083-1944229",
  "Pf3D7_11_v3-1938175-1938354"
)

# One sample name per line; BiologicalSample is looked up from meta.
samplesCovered <- readr::read_tsv("samplesCovered.txt", col_names = "sample") %>%
  left_join(select(meta, sample, BiologicalSample))

Code

# Per-sample haplotype clustering results.
popClustering <- readr::read_tsv(
  "finalHRPII_HRPIII_windows_withTunedSubWindows/popClustering/reports/slim_allSelectedClustersInfo.tab.txt.gz"
)

# Mark regions whose name contains "for".
regions_key <- mutate(regions_key, duplicationRegion = grepl("for", name))

# Attach the region key, then replace p_name with the genomic ID and prefix
# the haplotype ID with that genomic ID.
popClustering <- popClustering %>%
  left_join(rename(regions_key, p_name = name)) %>%
  mutate(
    p_name = genomicID,
    h_popUID = paste0(genomicID, "--", h_popUID)
  )

# Keep only the selected samples and drop the erroneous regions.
popClustering_filt <- popClustering %>%
  filter(
    s_Sample %fin% metaSelected$BiologicalSample,
    genomicID %!in% erroneousRegions
  )

# Samples flagged as possibly HRP3 deleted, and their clustering rows.
previousDeletionCalls_hrp3Delete <- filter(
  previousDeletionCalls,
  possiblyHRP3Deleted
)
popClustering_filt_hrp3Delete <- filter(
  popClustering_filt,
  s_Sample %in% previousDeletionCalls_hrp3Delete$BiologicalSample
)

# Regions flagged by the afterHomologousRegion column.
regions_afterHomologous <- filter(regions, afterHomologousRegion)

Code

# Deletion-pattern calls, limited to the biological samples in metaSelected.
allDeletionTypeMeta <- readr::read_tsv("allMetaDeletionCalls.tab.txt") %>%
  filter(BiologicalSample %in% metaSelected$BiologicalSample)

# Samples whose HRP3 deletion pattern is "Pattern 1".
allDeletionTypeMeta_hrp3_pat1 <- filter(
  allDeletionTypeMeta,
  HRP3_deletionPattern == "Pattern 1"
)

# Clustering rows belonging to the Pattern 1 samples.
popClustering_filt_hrp3_pat1 <- filter(
  popClustering_filt,
  s_Sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample
)


# Samples whose HRP3 deletion pattern is "Pattern 2".
allDeletionTypeMeta_hrp3_pat2 = allDeletionTypeMeta %>% 
  filter(HRP3_deletionPattern == "Pattern 2")

# Clustering rows belonging to the Pattern 2 samples. The sample list has to
# come from the Pattern 2 table: filtering against the Pattern 1 table would
# only reproduce popClustering_filt_hrp3_pat1.
popClustering_filt_hrp3_pat2 = popClustering_filt %>% 
  filter(s_Sample %in% allDeletionTypeMeta_hrp3_pat2$BiologicalSample)


# Number of samples per HRP3 deletion pattern (samples without a pattern are
# left out), followed by the table display.
allDeletionTypeMeta_deletionPatternCounts <- count(
  group_by(
    filter(allDeletionTypeMeta, !is.na(HRP3_deletionPattern)),
    HRP3_deletionPattern
  )
)

create_dt(allDeletionTypeMeta_deletionPatternCounts)

Show entries

Search:

	HRP3_deletionPattern	n
1	Pattern 1	120
2	Pattern 2	48

Showing 1 to 2 of 2 entries

Previous1Next

Pattern 2

Code

# Pattern 2 sample counts per country / region / secondaryRegion combination.
allDeletionTypeMeta_hrp3_pat2_count_country <- count(
  group_by(allDeletionTypeMeta_hrp3_pat2, country, region, secondaryRegion)
)
create_dt(allDeletionTypeMeta_hrp3_pat2_count_country)

Show entries

Search:

	country	region	secondaryRegion	n
				1 35
1	Cambodia	South East Asia - East	ASIA	35
2	Ghana	West Africa	AFRICA	1
3	Kenya	East Africa	AFRICA	1
4	Laos	South East Asia - East	ASIA	1
5	Malawi	South East Africa	AFRICA	1
6	Thailand	South East Asia - West	ASIA	2
7	Vietnam	South East Asia - East	ASIA	7

Showing 1 to 7 of 7 entries

Previous1Next

Code

# Pattern 2 sample counts per region / secondaryRegion combination.
allDeletionTypeMeta_hrp3_pat2_count_region <- count(
  group_by(allDeletionTypeMeta_hrp3_pat2, region, secondaryRegion)
)
create_dt(allDeletionTypeMeta_hrp3_pat2_count_region)

Show entries

Search:

	region	secondaryRegion	n
			1 43
1	East Africa	AFRICA	1
2	South East Africa	AFRICA	1
3	South East Asia - East	ASIA	43
4	South East Asia - West	ASIA	2
5	West Africa	AFRICA	1

Showing 1 to 5 of 5 entries

Previous1Next

Code

# Pattern 2 sample counts per secondaryRegion.
allDeletionTypeMeta_hrp3_pat2_count_continent <- count(
  group_by(allDeletionTypeMeta_hrp3_pat2, secondaryRegion)
)
create_dt(allDeletionTypeMeta_hrp3_pat2_count_continent)

Show entries

Search:

	secondaryRegion	n
1	AFRICA	3
2	ASIA	45

Showing 1 to 2 of 2 entries

Previous1Next

13-11++

Chr 11 duplicated region

Getting chr 11 duplication conserved counts

Below is code determining the samples with possible chr11 fragment duplication and breaking down the counts of perfect duplicated copies vs divergent copies.

Code

# Chromosome 11 regions flagged by afterHomologousRegion, with a description
# column: "intergenic" when extraField0 contains "extraField0=NA", otherwise
# the text following "description=" with any "]" characters removed.
regions_afterHomologous_chr11 <- regions %>%
  filter(`#chrom` == "Pf3D7_11_v3", afterHomologousRegion) %>%
  mutate(
    description = case_when(
      grepl("extraField0=NA", extraField0) ~ "intergenic",
      TRUE ~ gsub("\\]", "", gsub(".*description=", "", extraField0))
    )
  )

# One colour per distinct description; "intergenic" is then overridden with
# "#FF000000" (alpha channel 00).
descriptionColors <- scheme$hex(
  length(unique(regions_afterHomologous_chr11$description))
)
names(descriptionColors) <- unique(regions_afterHomologous_chr11$description)
descriptionColors["intergenic"] <- "#FF000000"

Code

# Pattern 1 clustering rows restricted to the chr11 regions selected above
# (matched on genomic ID).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 = popClustering_filt_hrp3_pat1 %>% 
  filter(p_name %in% regions_afterHomologous_chr11$genomicID)


# uniqHaps = number of rows for each sample/target pair. The result stays
# grouped by s_Sample and p_name.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>% 
  group_by(s_Sample, p_name) %>% 
  mutate(uniqHaps= n())

# Per sample: number of distinct targets, then the number of rows at each
# uniqHaps value divided by that target count.
# NOTE(review): count() tallies rows, so a target with k rows contributes k to
# n; freq may therefore not be a per-target fraction - confirm the intent.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_uniqSum = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>% 
  group_by(s_Sample) %>% 
  mutate(targets = length(unique(genomicID))) %>% 
  group_by(s_Sample, targets, uniqHaps) %>% 
  count() %>% 
  mutate(freq = n/targets)

# Haplotypes at or below this averaged fraction are ignored.
minafCutoff <- 0.15

# Per sample: how many targets carry exactly one haplotype above the cutoff
# (conserved), how many distinct targets there are, and their ratio.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
  filter(c_AveragedFrac > minafCutoff) %>%
  group_by(s_Sample, p_name) %>%
  mutate(
    uniqHaps = n(),
    marker = uniqHaps == 1
  ) %>%
  group_by(s_Sample) %>%
  summarise(
    conserved = sum(marker),
    targets = n_distinct(genomicID)
  ) %>%
  mutate(conservedID = conserved / targets)

# Samples with conservedID above this value are treated as near-perfect copies.
conservedCutOff <- 0.99

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies <- filter(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum,
  conservedID > conservedCutOff
)

# Overall: number of samples above the conserved cutoff, number of samples,
# and the resulting frequency.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOff <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  ungroup() %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample)
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

# The same summary per country and region.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByCountry <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(rename(metaByBioSample, s_Sample = sample)) %>%
  group_by(country, region) %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample)
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

# The same summary per region.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByRegion <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(rename(metaByBioSample, s_Sample = sample)) %>%
  group_by(region) %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample)
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

# The same summary per secondaryRegion (shown as continent in the report).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByContinent <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(rename(metaByBioSample, s_Sample = sample)) %>%
  group_by(secondaryRegion) %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample)
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

The number of samples with perfect copies

Code

# Display the overall perfect-copy summary (create_dt is defined outside this
# section; the rendered output is a searchable, paged table).
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOff)

Show entries

Search:

	perfectDuplication	totalSamps	perfectCopyFreq
	0 1	0 1	0.0 1.0
1	96	120	0.8

Showing 1 to 1 of 1 entries

Previous1Next

The number of samples with perfect copies broken down by country

Code

# Display the per-country perfect-copy summary.
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByCountry)

Show entries

Search:

	country	region	perfectDuplication	totalSamps	perfectCopyFreq
			0 39	1 40	0.000000000000000 1.000000000000000
1	Brazil	South America - Central	14	14	1
2	Cambodia	South East Asia - East	0	1	0
3	Colombia	South America - North	39	40	0.975
4	El Salvador	South America - North	1	1	1
5	Ethiopia	East Africa	9	21	0.4285714285714285
6	French Guiana	South America - Central	4	4	1
7	Ghana	West Africa	1	3	0.3333333333333333
8	Honduras	South America - North	0	1	0
9	Laos	South East Asia - East	2	2	1
10	Mali	West Africa	1	2	0.5

Showing 1 to 10 of 17 entries

Previous1 2Next

The number of samples with perfect copies broken down by regions

Code

# Display the per-region perfect-copy summary.
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByRegion)

Show entries

Search:

	region	perfectDuplication	totalSamps	perfectCopyFreq
		2 40	2 42	0.437500000000000 1.000000000000000
1	East Africa	11	23	0.4782608695652174
2	South America - Central	33	33	1
3	South America - North	40	42	0.9523809523809523
4	South East Asia - East	3	4	0.75
5	South East Asia - West	2	2	1
6	West Africa	7	16	0.4375

Showing 1 to 6 of 6 entries

Previous1Next

The number of samples with perfect copies broken down by continent.

Code

# Display the per-secondaryRegion (continent) perfect-copy summary.
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByContinent)

Show entries

Search:

	secondaryRegion	perfectDuplication	totalSamps	perfectCopyFreq
		5 73	6 75	0.461538461538461 0.973333333333334
1	AFRICA	18	39	0.4615384615384616
2	ASIA	5	6	0.8333333333333334
3	S_AMERICA	73	75	0.9733333333333334

Showing 1 to 3 of 3 entries

Previous1Next

Code

# Mean, minimum and standard deviation of conservedID over the samples that
# are at or below the conserved cutoff.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_meanId <- summarise(
  filter(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum,
    conservedID <= conservedCutOff
  ),
  meanID = mean(conservedID),
  minID = min(conservedID),
  sdID = sd(conservedID)
)

The breakdown of the level of divergence among the samples whose duplicated copies are divergent.

Code

# Display the conservedID statistics of the samples at or below the cutoff.
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_meanId)

Show entries

Search:

	meanID	minID	sdID
	0.901680801662235 0.901680801662236	0.832041343669250 0.832041343669251	0.048522227604741 0.048522227604742
1	0.9016808016622356	0.8320413436692506	0.04852222760474144

Showing 1 to 1 of 1 entries

Previous1Next

Population analysis of chr11 duplicated region

Calculating the population structure of the haplotypes after the shared region on chr 11 (the duplicated region), to see if there is any population signal associated with the duplicated copy, e.g. whether the copy is unique to a subset of haplotypes, and whether the copy is always perfect or shows variation.

Code

# All samples (not only the selected ones): clustering rows for the chr11
# regions, with the erroneous regions removed.
popClustering_filt_regions_afterHomologous_chr11 <- popClustering %>%
  filter(
    genomicID %!in% erroneousRegions,
    p_name %in% regions_afterHomologous_chr11$genomicID
  )

# Number of distinct targets observed per sample.
popClustering_filt_regions_afterHomologous_chr11_tarCounts <-
  popClustering_filt_regions_afterHomologous_chr11 %>%
  group_by(s_Sample) %>%
  summarise(tarCounts = n_distinct(p_name))

# Keep samples with at least 80% of the maximum target count, plus every
# sample that appears in previousDeletionCalls.
popClustering_filt_regions_afterHomologous_chr11_tarCounts_filt <- filter(
  popClustering_filt_regions_afterHomologous_chr11_tarCounts,
  tarCounts >= 0.80 * max(tarCounts) |
    s_Sample %in% previousDeletionCalls$BiologicalSample
)

# Number of distinct samples observed per target.
popClustering_filt_regions_afterHomologous_chr11_sampCounts <-
  popClustering_filt_regions_afterHomologous_chr11 %>%
  group_by(p_name) %>%
  summarise(sampCounts = n_distinct(s_Sample))

# Metadata with the deletion-pattern columns attached, written to disk.
metaByBioSample_out <- left_join(
  metaByBioSample,
  allDeletionTypeMeta %>%
    select(-sample, -ExperimentSample) %>%
    rename(sample = BiologicalSample)
)
write_tsv(metaByBioSample_out, "metaByBioSample_outwithHrpCalls.tab.txt")

# Four-column haplotype table for the retained samples, written to disk.
write_tsv(
  popClustering_filt_regions_afterHomologous_chr11 %>%
    filter(
      s_Sample %in% popClustering_filt_regions_afterHomologous_chr11_tarCounts_filt$s_Sample
    ) %>%
    ungroup() %>%
    select(s_Sample, p_name, h_popUID, c_AveragedFrac),
  "popClustering_filt_regions_afterHomologous_chr11.tab.txt.gz"
)

Code

# Pairwise haplotype-sharing comparison on the table written by the R step
# above. Sample, target, haplotype-ID and abundance columns are given by name;
# results go to pairwiseComps_regions_afterHomologous_chr11 (overwritten if it
# exists), including distance matrices, using 14 threads.
elucidator doPairwiseComparisonOnHapsSharingDev --tableFnp popClustering_filt_regions_afterHomologous_chr11.tab.txt.gz --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 14 --dout pairwiseComps_regions_afterHomologous_chr11 --verbose --overWriteDir --metaFnp metaByBioSample_outwithHrpCalls.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion,HRP3_deletionPattern --writeOutDistMatrices

Code

# Pairwise sample-by-sample matrix written by the elucidator step; the file has
# no header row. (Alternative matrix, disabled below: percOfTarSharingAtLeastOneHap.)
#jacardDist = readr::read_tsv("pairwiseComps_regions_afterHomologous_chr11/percOfTarSharingAtLeastOneHap.tab.txt.gz", col_names = F)
jacardDist = readr::read_tsv("pairwiseComps_regions_afterHomologous_chr11/jacardByHapsTarShared.tab.txt.gz", col_names = F)

# Sample names are stored in a separate file; they are used both as the column
# names and as a new `sample` column.
# assumes the row order of the matrix matches sampleNames.tab.txt - TODO confirm
jacardDistSamps = readr::read_tsv("pairwiseComps_regions_afterHomologous_chr11/sampleNames.tab.txt", col_names = "samples")
colnames(jacardDist) = jacardDistSamps$samples
jacardDist$sample = jacardDistSamps$samples


# jacardDist_filt = jacardDist[jacardDist$sample  %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample,c(allDeletionTypeMeta_hrp3_pat1$BiologicalSample, "sample")]
# jacardDist_gat = jacardDist_filt %>% 
#   gather(otherSample, index,1:(ncol(.) - 1))

# Long format: one row per (sample, otherSample) pair. Every column except the
# last one (`sample`) is gathered.
jacardDist_gat = jacardDist %>%
  gather(otherSample, index,1:(ncol(.) - 1))


# Keep only pairs in which both samples are Pattern 1 samples.
jacardDist_gat_filt = jacardDist_gat %>% 
  filter(sample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample, 
         otherSample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample)

# Back to wide format: one row per sample, one column per otherSample.
jacardDist_gat_filt_sp = jacardDist_gat_filt %>% 
  spread(otherSample, index)

# Numeric matrix without the leading `sample` column, with samples as row names.
jacardDist_gat_filt_sp_mat = as.matrix(jacardDist_gat_filt_sp[,2:ncol(jacardDist_gat_filt_sp)])
rownames(jacardDist_gat_filt_sp_mat) = jacardDist_gat_filt_sp$sample

Getting cluster groups

Code

# Alias of the Pattern 1 matrix, used for the reference clustering below.
jacardDist_gat_filt_sp_mat_pat1 = jacardDist_gat_filt_sp_mat

# get data just for variable regions (e.g., minPopSize = 2)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering = HaplotypeRainbows::prepForRainbow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11, minPopSize = 2)

# cluster based on the sharing between
# Builds a sample x haplotype presence/absence table: marker is 1 where a
# sample has the haplotype, and spread() fills the missing pairs with 0.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering %>% 
  group_by(p_name) %>% 
  mutate(sampleCount = length(unique(s_Sample))) %>% 
  group_by() %>% 
  filter(sampleCount >= 0.99*max(sampleCount)) %>% # get just targets with high sample coverage otherwise clustering will be by missingness 
  group_by(s_Sample, p_name) %>% 
  #filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  group_by() %>% 
  select(h_popUID, marker, s_Sample) %>%   
  spread(h_popUID, marker, fill = 0)

# Matrix without the leading s_Sample column, samples as row names; then the
# dist() defaults and the hclust() defaults are applied.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist)

# get clustering based on the jacard distance too for reference 
# NOTE(review): dist() is applied to the rows of the index matrix, i.e. samples
# are compared by their index profiles rather than by the index itself.
jacardDist_gat_filt_sp_mat_pat1_hc = hclust(dist(jacardDist_gat_filt_sp_mat_pat1))

# Cut parameters: number of groups (k) and cut height (h).
k_groups = 42;
h_groups = 1.1;

# Cut into k_groups groups and plot the dendrogram with labels coloured by group.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups = cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust, k = k_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <- as.dendrogram(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <- color_labels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend, k = k_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend)

Code

# Re-cut the same tree by height instead of by group count; these height-based
# groups replace the k-based ones.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups <- cutree(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust,
  h = h_groups
)

# Dendrogram with labels coloured by the height-based groups.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <- color_labels(
  as.dendrogram(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust
  ),
  h = h_groups
)

plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend)

Code

# Save the haplotype-based dendrogram as a 20 x 10 PDF.
pdf(
  "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend.pdf",
  width = 20,
  height = 10,
  useDingbats = FALSE
)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend)
dev.off()

quartz_off_screen 
                2

Code

# Reference clustering from the index matrix: cut into k_groups groups and
# plot the dendrogram with labels coloured by group.
jacardDist_gat_filt_sp_mat_pat1_hc_groups <- cutree(
  jacardDist_gat_filt_sp_mat_pat1_hc,
  k = k_groups
)
jacardDist_gat_filt_sp_mat_pat1_hc_dend <- color_labels(
  as.dendrogram(jacardDist_gat_filt_sp_mat_pat1_hc),
  k = k_groups
)
plot(jacardDist_gat_filt_sp_mat_pat1_hc_dend)

Code

# Save the reference dendrogram as a 20 x 10 PDF.
pdf(
  "jacardDist_gat_filt_sp_mat_pat1_hc_dend.pdf",
  width = 20,
  height = 10,
  useDingbats = FALSE
)
plot(jacardDist_gat_filt_sp_mat_pat1_hc_dend)
dev.off()

quartz_off_screen 
                2

Code

# One row per sample: its cluster ID and the size of that cluster. The result
# stays grouped by hcclust.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df <- tibble(
  BiologicalSample = names(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups
  ),
  hcclust = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups
) %>%
  # mutate(BiologicalSample =factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering$BiologicalSample))) %>%
  group_by(hcclust) %>%
  mutate(hcclustSize = n())

# Cluster IDs with more than one member.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  filter(hcclustSize != 1) %>%
  select(hcclust) %>%
  unique()

# Cluster IDs with exactly one member.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_singletonGroups <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  filter(hcclustSize == 1) %>%
  select(hcclust) %>%
  unique()

# Seeded colour generator; the second statement prints eight example colours.
newscheme <- iwanthue(seed = 626, force_init = TRUE)
newscheme$hex(8)

[1] "#ba9d50" "#7947b8" "#8fcf52" "#c25191" "#78b795" "#c35540" "#979bc2" "#4c3c3d"

Code

# Colours keyed by the original cluster ID: one generated colour per
# multi-member cluster, and a single grey for every singleton cluster.
# nonSingletonGroupsColors = createColorListFromDf(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups)$hcclust
nonSingletonGroupsColors = newscheme$hex(nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups))
names(nonSingletonGroupsColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups$hcclust
nonSingletonGroupsColors_singleton = rep("grey71", nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_singletonGroups))
names(nonSingletonGroupsColors_singleton) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_singletonGroups$hcclust
haploGroupColors = c(nonSingletonGroupsColors, nonSingletonGroupsColors_singleton)

# One row per cluster, largest first. newClusterName is the rank by size;
# hcclust is converted to character so it can be joined with the colour names.
# Chr11DupHapCluster is "singlet" for one-member clusters, otherwise the rank
# zero-padded to two characters.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  select(hcclust, hcclustSize) %>% 
  ungroup() %>% 
  unique() %>% 
  arrange(desc(hcclustSize)) %>% 
  mutate(hcclust = as.character(hcclust),
         newClusterName = row_number()) %>% 
  left_join(tibble(
    hcclust = names(haploGroupColors), 
    colors = unname(haploGroupColors)
  )) %>% 
  mutate(Chr11DupHapCluster = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, pad = "0", width = 2)))

# Colours keyed by the size-ranked cluster name.
newHaploGroupColors = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$colors
names(newHaploGroupColors)= popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$newClusterName

# Colours keyed by Chr11DupHapCluster: the multi-member clusters plus one
# shared "singlet" entry.
# NOTE(review): the singlet grey here is "grey77" while the per-cluster
# singleton colour above is "grey71" - confirm whether they should match.
newHaploGroupWithSingletColors = c(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$colors[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$hcclustSize > 1], 
                                   "grey77")
names(newHaploGroupWithSingletColors)= c(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$Chr11DupHapCluster[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$hcclustSize > 1], 
                                         "singlet")


# Attach the per-cluster columns to the per-sample table (hcclust converted
# back to integer for the join) and write the result to disk.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts %>% 
              mutate(hcclust = as.integer(hcclust)))

write_tsv(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df, "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df.tsv")

Code

library(circlize)
# Colour scale from the matrix minimum (blue) through the midpoint between
# that minimum and 1 (white) up to 1 (red).
#col_fun = colorRamp2(c(0, 0.5, 1), c(heat.colors(3)))
col_fun = colorRamp2(c(min(jacardDist_gat_filt_sp_mat), min(jacardDist_gat_filt_sp_mat) + (1-min(jacardDist_gat_filt_sp_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))

# Working copies of the Pattern 1 matrix; the _noLabs copy gets its row and
# column names replaced below.
jacardDist_gat_filt_sp_mat_noLabs = jacardDist_gat_filt_sp_mat
jacardDist_gat_filt_sp_mat_pat1 = jacardDist_gat_filt_sp_mat
# Metadata rows reordered to match the matrix row order, plus the cluster
# label and a flag for samples in the close-to-perfect-copy table.
meta_preferredSample = metaSelected %>% 
  filter(PreferredSample)
metaSelected_hrp3_pat1 = meta_preferredSample[match(rownames(jacardDist_gat_filt_sp_mat), meta_preferredSample$BiologicalSample), ]%>% 
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
              ungroup() %>% 
              select(BiologicalSample, Chr11DupHapCluster))
metaSelected_hrp3_pat1 = metaSelected_hrp3_pat1 %>% 
  mutate(PerfectChr11Copy = BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample)
# Row/column labels: only samples whose site is "LabIsolate" keep their name;
# all others (including a missing site) are blanked.
rownames(jacardDist_gat_filt_sp_mat_noLabs) = NULL
colnames(jacardDist_gat_filt_sp_mat_noLabs) = NULL
RowLabs = metaSelected_hrp3_pat1$BiologicalSample
RowLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] = ""
ColLabs = metaSelected_hrp3_pat1$BiologicalSample
ColLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] = ""
#RowLabs[metaSelected$country != "Ethiopia"] = ""
rownames(jacardDist_gat_filt_sp_mat_noLabs) = RowLabs
colnames(jacardDist_gat_filt_sp_mat_noLabs) = ColLabs


# Annotation columns shown next to the heatmap; secondaryRegion is displayed
# as "continent".
rowAnnoDf  = metaSelected_hrp3_pat1[,c("hrpCall", "PerfectChr11Copy", "country", "region", "secondaryRegion", "Chr11DupHapCluster")] %>% rename(continent = secondaryRegion) %>% as.data.frame()

# Font sizes for annotation labels and titles, and the cluster colour mapping
# added to the existing rowAnnoColors list.
annotationTextSize = 25 ;annotationTitleTextSize = 20;
rowAnnoColors[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors

# Column annotation. Its legends are hidden; the row annotation below shows
# the legends for the same columns.
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors,
  gp = gpar(col = "grey10"),
  show_legend = FALSE,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)

# Row annotation built from the same data frame and colours.
sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
# Heatmap of the pairwise index between Pattern 1 samples, with the metadata
# annotations on top and on the left and both dendrograms drawn 5 cm deep.
haptype_hrp3_pat1HeatMap = Heatmap(
  jacardDist_gat_filt_sp_mat_noLabs,
  cluster_columns = TRUE,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  heatmap_legend_param = list(
    # The legend title is a legend parameter, not a graphical parameter, so it
    # is set here rather than inside gpar().
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Samples with a duplicated chromosome 11 and deleted chr 13 (13-11++ of HRP3 deletion)

Jaccard index of the duplicated region on chromosome 11. A Jaccard index of 1 means complete agreement between samples in this region, while 0 means no haplotypes are shared in this region. Additional metadata for the samples is shown on the top and on the left, including country/region, the hrp2/3 calls, and whether the chr11 segment that has been duplicated is a perfect copy or not.

It appears the African samples and South American samples, while related within continent, are not very closely related to each other.

Code

# Render the Pattern 1 heatmap with all legends merged and placed at the bottom.
draw(haptype_hrp3_pat1HeatMap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")

Code

# Save the Pattern 1 heatmap as a 25 x 20 PDF, drawn the same way as on screen.
pdf(
  "haptype_hrp3_pat1.pdf",
  width = 25,
  height = 20,
  useDingbats = FALSE
)
draw(
  haptype_hrp3_pat1HeatMap,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

Similar samples to 13-11++

The code below finds the samples whose chromosome 11 is similar to that of the 13-11++ samples (deleted hrp3, duplicated sub-telomeric chr11 segment). These similar samples are selected regardless of hrp2/3 deletion status. This shows whether the chr11 segment that has been duplicated is circulating in the general population or is only associated with the samples that have the HRP3 deletion and chr11 duplication.

Code

# Samples that are field samples or whose site is "LabIsolate".
metaByBioSample_fieldOrIsolate <- filter(
  metaByBioSample,
  IsFieldSample | "LabIsolate" == site
)

# Pairs with an index above 0.98 (0.99 was used earlier) in which at least one
# member is a Pattern 1 sample, and both members are field/isolate samples
# present in samplesCovered.
jacardDist_gat_filt_forOtherSimilarToPat1 <- jacardDist_gat %>%
  filter(
    sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample |
      otherSample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample,
    sample %in% metaByBioSample_fieldOrIsolate$sample,
    otherSample %in% metaByBioSample_fieldOrIsolate$sample,
    sample %in% samplesCovered$BiologicalSample,
    otherSample %in% samplesCovered$BiologicalSample,
    index > 0.98
  )

# Every sample in those pairs plus all Pattern 1 samples, with "FCR3" removed.
simSamples <- unique(c(
  jacardDist_gat_filt_forOtherSimilarToPat1$sample,
  jacardDist_gat_filt_forOtherSimilarToPat1$otherSample,
  allDeletionTypeMeta_hrp3_pat1$BiologicalSample
))
simSamples <- simSamples[simSamples != "FCR3"]

# All pairs among those samples; NaN index values become 0.
jacardDist_gat_filt_simToPat1 <- jacardDist_gat %>%
  filter(sample %in% simSamples, otherSample %in% simSamples) %>%
  mutate(index = ifelse(is.nan(index), 0, index))

# Wide table and matrix (samples as row names, `sample` column dropped).
jacardDist_gat_filt_simToPat1_sp <- spread(
  jacardDist_gat_filt_simToPat1,
  otherSample,
  index
)

jacardDist_gat_filt_simToPat1_sp_mat <- as.matrix(
  jacardDist_gat_filt_simToPat1_sp[, 2:ncol(jacardDist_gat_filt_simToPat1_sp)]
)
rownames(jacardDist_gat_filt_simToPat1_sp_mat) <- jacardDist_gat_filt_simToPat1_sp$sample

Code

library(circlize)
#['#b2182b','#d6604d','#f4a582','#fddbc7','#f7f7f7','#d1e5f0','#92c5de','#4393c3','#2166ac']
col_fun = colorRamp2(c(min(jacardDist_gat_filt_simToPat1_sp_mat), min(jacardDist_gat_filt_simToPat1_sp_mat) + (1-min(jacardDist_gat_filt_simToPat1_sp_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
jacardDist_gat_filt_simToPat1_sp_mat_noLabs = jacardDist_gat_filt_simToPat1_sp_mat
meta_preferredSample = meta %>% 
  filter(PreferredSample)
metaSelected_hrp3_pat1 = meta_preferredSample[match(rownames(jacardDist_gat_filt_simToPat1_sp_mat), meta_preferredSample$BiologicalSample), ] %>% 
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
              ungroup() %>% 
              select(BiologicalSample, Chr11DupHapCluster))


sample_metadata_withAllDeletionCalls = readr::read_tsv("sample_metadata_withAllDeletionCalls.tsv")


metaSelected_hrp3_pat1 = metaSelected_hrp3_pat1 %>% 
  left_join(sample_metadata_withAllDeletionCalls %>% 
              rename(BiologicalSample = sample) %>% 
              select(BiologicalSample, Pattern)) %>% 
  mutate(PerfectChr11Copy = case_when(
    BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample ~ T, 
    BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum$s_Sample ~ F, 
    T ~ NA
  ))


# %>% 
#   mutate(hrpCall = ifelse(BiologicalSample %in% previousDeletionCalls$BiologicalSample, hrpCall, "unknown"))

# Only samples whose site is "LabIsolate" keep a visible label; every other
# sample (including those with a missing site) gets an empty string.
RowLabs = metaSelected_hrp3_pat1$BiologicalSample
RowLabs[is.na(metaSelected_hrp3_pat1$site) | metaSelected_hrp3_pat1$site != "LabIsolate"] = ""
#RowLabs[metaSelected$country != "Ethiopia"] = ""
# Columns carry the same labels as rows.
ColLabs = RowLabs

# Replace both sets of dimension names on the display copy of the matrix.
dimnames(jacardDist_gat_filt_simToPat1_sp_mat_noLabs) = list(RowLabs, ColLabs)

# Columns used as heatmap annotations; secondaryRegion is shown as "continent".
rowAnnoDf = metaSelected_hrp3_pat1 %>%
  select(
    Pattern, hrpCall, PerfectChr11Copy, country, region,
    continent = secondaryRegion, Chr11DupHapCluster
  ) %>%
  as.data.frame()

# Start from generated colours, then override the annotations that have
# predefined palettes.
temp_rowAnnoColors = createColorListFromDf(rowAnnoDf)
temp_rowAnnoColors$hrpCall = pfhrpsCallColors
temp_rowAnnoColors$continent = continentColors
temp_rowAnnoColors$region = rowAnnoColors$region
temp_rowAnnoColors$Pattern = rowAnnoColorsMod_hrp3DeletionPattern
temp_rowAnnoColors$Chr11DupHapCluster = newHaploGroupWithSingletColors

# Font sizes shared by the annotation and heatmap legends.
annotationTextSize = 25
annotationTitleTextSize = 20



# Column annotation built from rowAnnoDf; its legends are hidden
# (show_legend = FALSE). NA cells use a fully transparent colour.
topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = temp_rowAnnoColors,
  na_col = "#99999900",
  gp = gpar(col = "grey10"),
  show_legend = FALSE,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)

# Row annotation built from the same data frame and colours; legends are left
# at their default visibility.
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = temp_rowAnnoColors,
  na_col = "#99999900",
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
# Heatmap of the similarity matrix with matching annotations on the top and
# left. Rows and columns are both clustered, and the dendrograms get 5 cm each.
haptype_simTo_hrp3_pat1HeatMap = Heatmap(
  jacardDist_gat_filt_simToPat1_sp_mat_noLabs,
  cluster_columns = TRUE,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  heatmap_legend_param = list(
    # The legend title is a legend parameter, not a graphical parameter, so it
    # belongs here rather than inside gpar().
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

The code below retrieves the samples whose chromosome 11 is similar to that of the 13-11++ samples. These similar samples are selected regardless of hrp2/3 deletion status. This shows whether the duplicated chr11 is circulating in the general population or is only associated with the samples that have an HRP3 deletion and a chr11 duplication.

It appears that the duplicated chromosome 11 is circulating fairly commonly among South American samples that don’t have an HRP3 deletion, while none of the duplicated chr11 appears to be circulating in the African population (though this could reflect a high- vs. low-diversity bias and/or sampling biases, given the drastic differences in malaria dynamics between the two continents).

Code

# Draw the heatmap with a transparent background and all legends merged at the
# bottom.
draw(
  haptype_simTo_hrp3_pat1HeatMap,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom",
  merge_legend = TRUE,
  background = "transparent"
)

Code

# Write the same heatmap to a 30 x 35 inch PDF.
pdf(
  "haptype_simTo_hrp3_pat1.pdf",
  width = 30,
  height = 35,
  useDingbats = FALSE
)
draw(
  haptype_simTo_hrp3_pat1HeatMap,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom",
  merge_legend = TRUE,
  background = "transparent"
)
dev.off()

quartz_off_screen 
                2

Plotting haplotypes typed per genomic region

Plotting the variation at the duplicated region, coloring haplotypes by their abundance rank. This visualization allows interpretation of how similar these haplotypes are and what the copy looks like within a sample (e.g. a perfect copy vs. variation, and how much variation).

All samples with 13-11++ HRP3 deletion

Code

# Prepare the haplotype table for the rainbow plot (minPopSize = 1).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11,
  minPopSize = 1
)

# Sample x haplotype presence table (1 = present, 0 = absent), restricted to
# regions whose distinct-sample count is at least 99% of the largest count.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep %>%
  group_by(p_name) %>%
  mutate(sampleCount = n_distinct(s_Sample)) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Numeric matrix (every column after the first, which holds s_Sample) with
# samples as row names, then distances between samples and a hierarchical
# clustering of them.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp[
    , 2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp)
  ]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat) =
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist =
  dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist_hclust =
  hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist)


# Clustering of the Jaccard matrix; only referenced by the commented-out
# alternative sample ordering further down.
jacardDist_gat_filt_sp_mat_pat1_hc = jacardDist_gat_filt_sp_mat_pat1 %>%
  dist() %>%
  hclust()

# Re-level the samples so they follow the hierarchical clustering order, and
# recode popid to -1 wherever maxPopid is 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist_hclust$order]
      # alternative ordering:
      # levels = rownames(jacardDist_gat_filt_sp_mat_pat1)[jacardDist_gat_filt_sp_mat_pat1_hc$order]
    )
  ) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Base rainbow plot with one x break per region, labelled with the region name.
# seq_along() keeps breaks and labels the same length even when there are no
# levels (1:length(x) would give c(1, 0)).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name),
    expand = c(0, 0)
  ) +
  scale_y_continuous(expand = c(0, 0))

# Metadata for the plotted samples, with the Chr11DupHapCluster column joined
# on and BiologicalSample using the same level order as s_Sample in the plot
# data.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$s_Sample) %>%
  left_join(
    select(
      popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df,
      BiologicalSample, Chr11DupHapCluster
    )
  ) %>%
  mutate(
    BiologicalSample = factor(
      BiologicalSample,
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$s_Sample)
    )
  )

# Concatenate every palette stored in rowAnnoColors.
allColors = c()
for (name in names(rowAnnoColors)) {
  allColors = c(allColors, rowAnnoColors[[name]])
}

# Fill colours of the first layer of the built plot, named by the sorted unique
# popid values; the "-1" entry is then set to "grey0".
# NOTE(review): this assumes the unique fills come out in sorted-popid order - confirm.
previousColors = ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot)$data[[1]][["fill"]] %>%
  unique()
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Plot data with samples in clustering order and popid converted to a factor.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist_hclust$order]
    ),
    popid = factor(popid)
  )

Code

# Rainbow plot for the mod1 data. The four negative x breaks label the
# annotation strips added later; the region breaks get empty labels. The y axis
# has one break per sample, labelled with the sample name.
# seq_along() keeps breaks and labels the same length even when there are no
# levels (1:length(x) would give c(1, 0)).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$p_name))
    ),
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country",
      # levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$p_name),
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$p_name)))
    ),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$s_Sample)
  )


# Add four per-sample annotation strips to the left of the haplotype columns
# (negative x). Each strip is a geom_rect layer one unit tall, centred on the
# sample's numeric factor position, and each gets its own manual fill scale
# introduced by ggnewscale::new_scale_fill(). Layer/scale order is significant.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 + 
  # Fill scale for the haplotype bars, restricted to the ranks named in previousColors.
  scale_fill_manual("Microhaplotype\nRank",  values = haplotypeRankColors[sort(names(previousColors))], labels = names(haplotypeRankColors[sort(names(previousColors))]), breaks = names(haplotypeRankColors[sort(names(previousColors))]))  + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # Strip 1: country, x from -4.5 to 0.
  geom_rect(aes(xmin= 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = country), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep) + 
  scale_fill_manual("country",  values = rowAnnoColors[["country"]])  + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill()  + 
  # Strip 2: region, x from -9.5 to -5.
  geom_rect(aes(xmin= -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = region), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep) + 
  scale_fill_manual("region",  values = rowAnnoColors[["region"]])  + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill()  + 
  # Strip 3: continent (the secondaryRegion column), x from -14.5 to -10.
  geom_rect(aes(xmin= -10, xmax = -14.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = secondaryRegion), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep)+ 
  scale_fill_manual("Continent",  values = rowAnnoColors[["continent"]])  + 
  guides(fill = guide_legend(nrow = 3))  + 
  ggnewscale::new_scale_fill() +
  # Strip 4: Chr11DupHapCluster (as a factor), x from -19.5 to -15.
  geom_rect(aes(xmin= -15, xmax = -19.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = factor(Chr11DupHapCluster)), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep)+
  scale_fill_manual("Chr11DupHapCluster",  values = newHaploGroupWithSingletColors, labels = names(newHaploGroupWithSingletColors),
                    breaks = names(newHaploGroupWithSingletColors))  + 
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-regions within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank; while the same color within a vertical column is the same haplotype, the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar for a genomic position is monoclonal at this position, while multiple bars would indicate a polyclonal sample (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).

Code

# Gene/region descriptions for the plotted regions only, with genomicID using
# the same level order as p_name in the plot data.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name) %>%
  mutate(
    genomicID = factor(
      genomicID,
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)
    )
  )

# Add a track below the samples (y from -5 to 0) coloured by the region
# description, then apply the transparent background and large legend text.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 +
  new_scale_fill() +
  geom_rect(
    aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymin = -5,
      ymax = 0,
      fill = description
    ),
    data = regions_afterHomologous_chr11_filt,
    color = "black"
  ) +
  scale_fill_manual(
    "Genes\nDescription",
    values = descriptionColors,
    guide = guide_legend(nrow = 5)
  ) +
  transparentBackground +
  theme(
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30, face = "bold"),
    legend.box = "vertical",
    legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    axis.text.x = element_text(size = 30)
  )

print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1)

Code

# Write the mod1 plot to a 40 x 30 inch PDF.
pdf(
  "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot.pdf",
  width = 40,
  height = 30,
  useDingbats = FALSE
)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1)
dev.off()

quartz_off_screen 
                2

Code

# Prepare the haplotype table again, this time with minPopSize = 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11,
  minPopSize = 2
)

# Sample x haplotype presence table (1 = present, 0 = absent), restricted to
# regions whose distinct-sample count is at least 99% of the largest count.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 %>%
  group_by(p_name) %>%
  mutate(sampleCount = n_distinct(s_Sample)) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Numeric matrix (every column after the first, which holds s_Sample) with
# samples as row names, then distances between samples and a hierarchical
# clustering of them.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp[
    , 2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp)
  ]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat) =
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist =
  dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist_hclust =
  hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist)


# Re-level the samples so they follow the hierarchical clustering order, and
# recode popid to -1 wherever maxPopid is 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist_hclust$order]
    )
  ) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Base rainbow plot with one x break per region, labelled with the region name.
# seq_along() keeps breaks and labels the same length even when there are no
# levels (1:length(x) would give c(1, 0)).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name),
    expand = c(0, 0)
  )

# Metadata for the mod3 samples, with the Chr11DupHapCluster column joined on
# and BiologicalSample using the same level order as s_Sample in the mod3 plot
# data.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$s_Sample) %>%
  left_join(
    select(
      popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df,
      BiologicalSample, Chr11DupHapCluster
    )
  ) %>%
  mutate(
    BiologicalSample = factor(
      BiologicalSample,
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$s_Sample)
    )
  )

# Concatenate every palette stored in rowAnnoColors.
allColors = c()
for (name in names(rowAnnoColors)) {
  allColors = c(allColors, rowAnnoColors[[name]])
}

# Fill colours of the first layer of the built mod3 plot, named by the sorted
# unique popid values; the "-1" entry is then set to "grey0".
# NOTE(review): this assumes the unique fills come out in sorted-popid order - confirm.
previousColors = ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_plot)$data[[1]][["fill"]] %>%
  unique()
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Plot data with samples in clustering order and popid converted to a factor.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist_hclust$order]
    ),
    popid = factor(popid)
  )



# Rainbow plot for the mod3 data. The x axis is extended to -30 on the left;
# the two negative breaks label the annotation strips added later, followed by
# one labelled break per region.
# seq_along() keeps breaks and labels the same length even when there are no
# levels (1:length(x) would give c(1, 0)).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    limits = c(
      -30,
      max(c(
        -9.5 + 2.25, -4.5 + 2.25,
        seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$p_name))
      ))
    ),
    breaks = c(
      -9.5 + 2.25, -4.5 + 2.25,
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$p_name))
    ),
    labels = c(
      "HaploGroup", "continent",
      levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$p_name)
    ),
    expand = c(0, 0)
  )

# Add two per-sample annotation strips to the left of the haplotype columns.
# Each strip is a geom_rect layer one unit tall, centred on the sample's numeric
# factor position, with its own manual fill scale.
# Both strips must use the mod3 metadata: its BiologicalSample levels follow the
# mod3 sample order, so the strips line up with the plotted rows. The cluster
# strip previously used the metadata ordered for the minPopSize = 1 plot.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 +
  # Fill scale for the haplotype bars, restricted to the ranks named in previousColors.
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = haplotypeRankColors[sort(names(previousColors))],
    labels = names(haplotypeRankColors[sort(names(previousColors))]),
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) +
  guides(fill = guide_legend(nrow = 4)) +
  ggnewscale::new_scale_fill() +
  # Strip 1: continent (the secondaryRegion column), x from -4.5 to 0.
  geom_rect(
    aes(
      xmin = 0, xmax = -4.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = secondaryRegion
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3
  ) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 4)) +
  ggnewscale::new_scale_fill() +
  # Strip 2: Chr11DupHapCluster (as a factor), x from -9.5 to -5.
  geom_rect(
    aes(
      xmin = -5, xmax = -9.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = factor(Chr11DupHapCluster)
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3
  ) +
  scale_fill_manual(
    "Chr11DupHapCluster",
    values = newHaploGroupWithSingletColors,
    labels = names(newHaploGroupWithSingletColors),
    breaks = names(newHaploGroupWithSingletColors)
  ) +
  guides(fill = guide_legend(nrow = 4))

# Gene/region descriptions for the regions in the mod3 data, with genomicID
# using the same level order as p_name.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name) %>%
  mutate(
    genomicID = factor(
      genomicID,
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name)
    )
  )

# y labels in sample order; every label except the three named isolates is
# blanked.
yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$s_Sample)
yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3[yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 %!in% c("HB3", "Santa-Lucia-Salvador-I", "SD01")] = ""


# One y break per sample, then strip all axis decoration and the panel border.
# seq_along() keeps breaks and labels the same length even when there are no
# samples (1:length(x) would give c(1, 0)).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 +
  scale_y_continuous(
    labels = yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3,
    breaks = seq_along(yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3),
    expand = c(0, 0)
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank()
  )
# Keep a copy of the plot as it stands here, before any further layers are added.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3_priorToGeneInfo = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3

# Add a track below the samples (y from -7 to 0) coloured by the region
# description, plus right-aligned sample names at x = -10.
# The label data holds one row per distinct sample. Using every row of the plot
# data would draw each name once per haplotype record, on top of itself.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymax = 0,
      ymin = -7,
      fill = description
    ),
    data = regions_afterHomologous_chr11_filt,
    color = "black"
  ) +
  geom_text(
    aes(
      y = as.numeric(BiologicalSample),
      x = -10,
      label = BiologicalSample
    ),
    hjust = 1,
    #data = tibble(BiologicalSample = factor(c("HB3", "Santa-Lucia-Salvador-I", "SD01"), levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$s_Sample)))
    data = tibble(
      BiologicalSample = unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$s_Sample)
    ) %>%
      filter(!is.na(BiologicalSample))
  ) +
  scale_fill_manual(
    "Genes\nDescription",
    values = descriptionColors,
    guide = guide_legend(nrow = 4)
  )

Code

# Render the mod3 plot in the document.
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3)

Collapsing parasites by same haplotypes

Code

# Write the mod3 plot to a 25 x 20 inch PDF.
pdf(
  "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod3_onlyVariableSites.pdf",
  width = 25,
  height = 20,
  useDingbats = FALSE
)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3)
dev.off()

quartz_off_screen 
                2

Code

# Write the saved priorToGeneInfo copy of the mod3 plot to a 15 x 12.5 inch PDF.
pdf(
  "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod3_onlyVariableSites_noGeneInfo.pdf",
  width = 15,
  height = 12.5,
  useDingbats = FALSE
)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3_priorToGeneInfo)
dev.off()

quartz_off_screen 
                2

Code

# Per sample: number of distinct regions and the mean of uniqHaps, joined with
# the clustering-group table.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
  group_by(s_Sample) %>%
  summarise(
    p_name_count = n_distinct(p_name),
    p_name_meanCOI = mean(uniqHaps)
  ) %>%
  left_join(
    rename(
      popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df,
      s_Sample = BiologicalSample
    )
  )

# One representative per cluster with hcclustSize > 1: the first sample after
# sorting by most regions, then lowest mean of uniqHaps. secondaryRegion is
# then joined on.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness %>%
  #filter(s_Sample  %!in%  c("HB3", "QV0040-C", "IGS-CBD-010")) %>%
  #filter(hcclustSize > 2, newClusterName != 9) %>%
  #filter(hcclustSize > 1, newClusterName != 9) %>%
  filter(hcclustSize > 1) %>%
  arrange(desc(p_name_count), p_name_meanCOI) %>%
  group_by(newClusterName) %>%
  mutate(groupID = row_number()) %>%
  filter(groupID == 1) %>%
  left_join(
    select(meta_preferredSample, s_Sample = BiologicalSample, secondaryRegion)
  )

# Order the representatives by continent (S_AMERICA, AFRICA, ASIA) and then by
# decreasing cluster size, and freeze that order as the s_Sample factor levels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt %>%
  mutate(
    secondaryRegion = factor(
      secondaryRegion,
      levels = c("S_AMERICA", "AFRICA", "ASIA")
    )
  ) %>%
  arrange(secondaryRegion, desc(hcclustSize)) %>%
  mutate(s_Sample = factor(s_Sample, levels = .$s_Sample))



# Prepare the haplotype table for the cluster representatives only
# (minPopSize = 2).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 = HaplotypeRainbows::prepForRainbow(
  filter(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11,
    s_Sample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt$s_Sample
  ),
  minPopSize = 2
)

# Sample x haplotype presence table (1 = present, 0 = absent), restricted to
# regions whose distinct-sample count is at least 99% of the largest count.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 %>%
  group_by(p_name) %>%
  mutate(sampleCount = n_distinct(s_Sample)) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Numeric matrix (every column after the first, which holds s_Sample) with
# samples as row names, then distances between samples and a hierarchical
# clustering of them.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp[
    , 2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp)
  ]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat) =
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist =
  dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust =
  hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist)


# Clustering of the Jaccard matrix; only referenced by a commented-out
# alternative sample ordering.
jacardDist_gat_filt_sp_mat_pat1_hc = jacardDist_gat_filt_sp_mat_pat1 %>%
  dist() %>%
  hclust()

#rename the levels so they are in the order of the clustering 
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 %>% 
  mutate(s_Sample = factor(s_Sample, 
                           levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt$s_Sample))) %>% 
                           # levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust$order])) %>%
                           # levels = rownames(jacardDist_gat_filt_sp_mat_pat1)[jacardDist_gat_filt_sp_mat_pat1_hc$order])) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Base rainbow plot coloured by popid. The x axis gets one break per p_name
# level, labelled with the level name; seq_along() keeps breaks and labels the
# same length even when there are no levels (1:length() would yield c(1, 0)).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name), 
    expand = c(0, 0)
  )

# Metadata rows for the plotted samples, with BiologicalSample sharing the
# plot's s_Sample level order so as.numeric() gives matching y positions.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$s_Sample) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$s_Sample)))

# Concatenate every annotation palette stored in rowAnnoColors.
allColors = c()
for (name in names(rowAnnoColors)) {
  allColors = c(allColors, rowAnnoColors[[name]])
}

# Fill colours used by the first layer of the base plot, named by the sorted
# popid values; popid -1 is forced to "grey0".
previousColors = ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_plot)$data[[1]][["fill"]] %>% 
  unique()
names(previousColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$popid %>% 
  unique() %>% 
  sort()
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Copy of the mod4 data with samples ordered by the region-completeness table
# levels and popid turned into a factor (discrete fill).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 %>% 
  mutate(s_Sample = factor(s_Sample, 
                           levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt$s_Sample))) %>% 
  # mutate(s_Sample = factor(s_Sample, 
  #                          levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust$order])) %>% 
  mutate(popid = factor(popid))



# Rainbow plot with room on the left (x < 0, down to -30) for annotation
# columns. Two extra breaks label those columns; the remaining breaks are one
# per p_name level. seq_along() replaces 1:length(), which would produce
# c(1, 0) for an empty level set and desynchronise breaks and labels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    limits = c(-30, max(c(-9.5 + 2.25, -4.5 + 2.25, seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$p_name))))), 
    breaks = c(-9.5 + 2.25, -4.5 + 2.25, seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$p_name))), 
    labels = c("HaploGroup", "continent", levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$p_name)), 
    expand = c(0, 0)
  )

# Cut parameters: k = one group per row of the region-completeness table,
# h = fixed tree height.
k_groups = nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt)
h_groups = 1.1

# First pass: cut by number of groups and draw the dendrogram with labels
# coloured by that cut. The group assignment is overwritten by the second pass.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups = cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust, k = k_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust %>% 
  as.dendrogram() %>% 
  color_labels(k = k_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend)

# Second pass: cut by height; these groups and this dendrogram are the ones
# left in the workspace for the following steps.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups = cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust, h = h_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust %>% 
  as.dendrogram() %>% 
  color_labels(h = h_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend)

Code

# Same k-group cut applied to the jaccard-matrix clustering, followed by its
# dendrogram with labels coloured per group.
jacardDist_gat_filt_sp_mat_pat1_hc_groups = cutree(jacardDist_gat_filt_sp_mat_pat1_hc, k = k_groups)
jacardDist_gat_filt_sp_mat_pat1_hc_dend = jacardDist_gat_filt_sp_mat_pat1_hc %>% 
  as.dendrogram() %>% 
  color_labels(k = k_groups)
plot(jacardDist_gat_filt_sp_mat_pat1_hc_dend)

Code

# One row per sample with its cluster id (from the height-based cutree) and the
# size of that cluster. The tibble is left grouped by hcclust, as before.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df = tibble(
  BiologicalSample = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups), 
  hcclust = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups
) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$BiologicalSample))) %>% 
  group_by(hcclust) %>% 
  mutate(hcclustSize = n())

# Cluster ids split into multi-sample clusters and singletons.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_biggerGroups = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% filter(hcclustSize != 1) %>% select(hcclust) %>% unique()
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_singletonGroups = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% filter(hcclustSize == 1) %>% select(hcclust) %>% unique()

# Each palette is still generated for the total number of clusters (so every
# cluster keeps the colour it had before), but is truncated to the number of
# ids it is named with. Previously the palette was longer than the name vector,
# so `names<-` padded the remainder with NA and haploGroupColors carried
# NA-named colours.
# NOTE(review): both palettes start from the same colours, so singleton and
# multi-sample clusters share colours - confirm this is intended.
nonSingletonGroupsColors = head(
  scheme$hex(nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% select(hcclust) %>% unique())), 
  nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_biggerGroups)
)
names(nonSingletonGroupsColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_biggerGroups$hcclust
nonSingletonGroupsColors_singleton = head(
  scheme$hex(nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% select(hcclust) %>% unique())), 
  nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_singletonGroups)
)
names(nonSingletonGroupsColors_singleton) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_singletonGroups$hcclust
haploGroupColors = c(nonSingletonGroupsColors, nonSingletonGroupsColors_singleton)

# One row per cluster, largest first; newClusterName is the size rank and
# colors is looked up by the original cluster id (as character, to match the
# palette names).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% 
  select(hcclust, hcclustSize) %>% 
  ungroup() %>% 
  unique() %>% 
  arrange(desc(hcclustSize)) %>% 
  mutate(hcclust = as.character(hcclust), newClusterName = row_number()) %>% 
  left_join(
    tibble(
      hcclust = names(haploGroupColors), 
      colors = unname(haploGroupColors)
    ), 
    by = "hcclust"
  )
newHaploGroupColors = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts$colors
names(newHaploGroupColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts$newClusterName

# Attach the rank/colour per cluster and the size of each sample's group in the
# separate "forClustering" grouping (as originalGroupSize), then restore the
# plot's sample level order. Join keys are spelled out; they are the columns
# the implicit natural joins matched on.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts %>%
      mutate(hcclust = as.integer(hcclust)), 
    by = c("hcclust", "hcclustSize")
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% ungroup() %>% select(BiologicalSample, hcclustSize) %>% rename(originalGroupSize = hcclustSize), 
    by = "BiologicalSample"
  ) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$BiologicalSample)))


# jacardDist_gat_filt_sp_mat_pat1_hc_groups_df = tibble(
#   BiologicalSample = names(jacardDist_gat_filt_sp_mat_pat1_hc_groups), 
#   hcclust = jacardDist_gat_filt_sp_mat_pat1_hc_groups
# ) %>% 
#   mutate(BiologicalSample =factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$BiologicalSample)))
# Layer the annotations onto the mod4 plot:
#   1. manual fill scale for the haplotype tiles, restricted to the popid
#      values named in previousColors;
#   2. a continent column (x from -4.5 to 0) on a fresh fill scale;
#   3. an "n=<originalGroupSize>" text label per sample at x = -9.5.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 + 
  scale_fill_manual(
    "Microhaplotype\nRank", 
    values = haplotypeRankColors[sort(names(previousColors))], 
    labels = names(haplotypeRankColors[sort(names(previousColors))]), 
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = 0, 
      xmax = -4.5, 
      ymin = as.numeric(BiologicalSample) - 0.5, 
      ymax = as.numeric(BiologicalSample) + 0.3, 
      fill = secondaryRegion
    ), 
    color = "black", 
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4
  ) + 
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) + 
  guides(fill = guide_legend(nrow = 4)) + 
  # Disabled haplogroup column, kept for reference:
  # ggnewscale::new_scale_fill() +
  # geom_rect(aes(xmin= -5, xmax = -9.5,
  #               ymin = as.numeric(BiologicalSample) - 0.5, 
  #               ymax = as.numeric(BiologicalSample) + 0.3, 
  #               fill = factor(newClusterName)), color = "black", data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df)+
  #               # fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+ 
  # # scale_fill_manual("HaploGroup",  values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups))))  + 
  # #scale_fill_manual("Chr11DupHapCluster",  values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors))  + 
  # scale_fill_manual("Chr11DupHapCluster",  values = newHaploGroupColors, labels = names(newHaploGroupColors),
  #                   breaks = names(newHaploGroupColors)) +
  geom_text(
    aes(
      x = -9.5, 
      y = as.numeric(BiologicalSample) - 0.5 + 0.4, 
      label = paste0("n=", originalGroupSize)
    ), 
    color = "black", 
    data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df
  ) + 
  guides(fill = guide_legend(nrow = 4))

# Region annotation rows for the plotted p_name values, with genomicID sharing
# the plot's p_name level order so as.numeric(genomicID) matches the x axis.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name)))

# Sample names in plotting order, used as y-axis labels.
yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$s_Sample)
# yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4[yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 %!in% c("HB3", "Santa-Lucia-Salvador-I", "SD01")] = ""


# One y break per sample (seq_along() stays in step with the labels even for an
# empty label vector), then strip all axis decoration and box the legends.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 + 
  scale_y_continuous(
    labels = yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4, 
    breaks = seq_along(yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4), 
    expand = c(0, 0)
  ) + 
  theme(
    axis.text.x = element_blank(), 
    axis.line.x = element_blank(), 
    axis.ticks.x = element_blank(), 
    axis.title.x = element_blank(), 
    axis.line.y = element_blank(), 
    axis.ticks.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.title.y = element_blank(), 
    panel.border = element_blank(), 
    legend.background = element_blank(), 
    legend.box.background = element_rect(colour = "black", linewidth = 1)
  )

# Snapshot of the plot before the gene-description track is added; exported
# separately later.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4_priorToGeneInfo = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4

# Add a gene-description track below the samples (y from -1 to 0, one tile per
# genomic region) on its own fill scale, and print the names of three reference
# samples to the left of the annotation columns.
# new_scale_fill() is namespace-qualified like every other use in this file, so
# the chunk does not depend on ggnewscale being attached.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 + 
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = as.numeric(genomicID) - 0.5, 
      xmax = as.numeric(genomicID) + 0.5, 
      ymax = 0, 
      ymin = -1, 
      fill = description
    ), 
    data = regions_afterHomologous_chr11_filt, 
    color = "black"
  ) + 
  geom_text(
    aes(
      y = as.numeric(BiologicalSample), 
      x = -10, 
      label = BiologicalSample
    ), 
    hjust = 1, 
    # Factor levels follow the plot's sample order so the labels land on the
    # matching rows.
    data = tibble(BiologicalSample = factor(c("HB3", "Santa-Lucia-Salvador-I", "SD01"), levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$s_Sample)))
  ) + 
  scale_fill_manual(
    "Genes\nDescription", 
    values = descriptionColors, 
    guide = guide_legend(nrow = 4)
  )

Code

# Draw the annotated mod4 rainbow plot on the active graphics device.
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4)

Code

# Export the annotated mod4 plot as a 25 x 15 inch PDF.
# FALSE is spelled out because F is an ordinary, reassignable variable.
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod4_onlyVariableSites.pdf", useDingbats = FALSE, width = 25, height = 15)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4)
dev.off()

quartz_off_screen 
                2

Code

# Export the snapshot taken before the gene-description track was added, as a
# 15 x 6 inch PDF. FALSE is spelled out because F is a reassignable variable.
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod4_onlyVariableSites_noGeneInfo.pdf", useDingbats = FALSE, width = 15, height = 6)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4_priorToGeneInfo)
dev.off()

quartz_off_screen 
                2

Perfect copies

Code

# Rainbow-plot input for the samples listed in the closeToPerfectCopies table
# (minPopSize = 1).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>% 
  filter(s_Sample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample) %>% 
  HaplotypeRainbows::prepForRainbow(minPopSize = 1)

# Sample x haplotype presence table: keep p_name entries whose sample count is
# above 90% of the maximum, mark each remaining (sample, haplotype) row with 1
# and spread to wide form (absent = 0).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>% 
  group_by(p_name) %>% 
  mutate(sampleCount = n_distinct(s_Sample)) %>% 
  ungroup() %>% 
  filter(sampleCount > 0.9 * max(sampleCount)) %>% 
  group_by(s_Sample, p_name) %>% 
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  ungroup() %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix without the leading s_Sample column, rows named by sample,
# clustered with the default dist()/hclust() settings.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp[, -1])
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat %>% 
  dist()
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist_hclust = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist %>% 
  hclust()

# Reorder the sample levels to follow the hierarchical-clustering leaf order,
# and recode popid to -1 wherever maxPopid == 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>% 
  mutate(s_Sample = factor(s_Sample, 
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist_hclust$order])) %>% 
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Base rainbow plot with one x break per p_name level; seq_along() keeps breaks
# and labels the same length even for an empty level set, where 1:length()
# would yield c(1, 0).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$p_name)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$p_name), 
    expand = c(0, 0)
  )




# Metadata for the plotted samples, with BiologicalSample sharing the plot's
# s_Sample level order.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample) %>% 
  mutate(
    BiologicalSample = factor(
      BiologicalSample, 
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample)
    )
  )

# Concatenate every annotation palette stored in rowAnnoColors.
allColors = c()
for (name in names(rowAnnoColors)) {
  allColors = c(allColors, rowAnnoColors[[name]])
}

# Fill colours used by the first layer of the base plot, named by the sorted
# popid values; popid -1 is forced to "grey0".
previousColors = ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_plot)$data[[1]][["fill"]] %>% 
  unique()
names(previousColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$popid %>% 
  unique() %>% 
  sort()
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Copy of the closeToPerfectCopies data with samples in clustering leaf order
# and popid as a factor (discrete fill).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>% 
  mutate(s_Sample = factor(s_Sample, 
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist_hclust$order])) %>% 
  mutate(popid = factor(popid))

# Rainbow plot whose x axis carries four labelled annotation columns (negative
# x) followed by one unlabelled break per p_name level, and whose y axis has
# one labelled break per sample. seq_along()/nlevels() replace the
# 1:length(levels()) idiom so breaks and labels cannot get out of step when a
# factor has no levels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = c(-19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name))), 
    labels = c("Chr11DupHapCluster", "continent", "region", "country", 
               rep("", nlevels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name))
               # levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name)
               ), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(
    expand = c(0, 0), 
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$s_Sample)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$s_Sample)
  )




# Cluster assignments (from the "forClustering" grouping) for the samples in
# this plot, releveled to the plot's sample order. Clusters of size 1 are all
# labelled "singlet"; the rest get their zero-padded two-character cluster
# number.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = 
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample) %>% 
  mutate(
    BiologicalSample = factor(
      as.character(BiologicalSample), 
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample)
    ), 
    Chr11DupHapClusterName = ifelse(
      hcclustSize == 1, 
      "singlet", 
      stringr::str_pad(newClusterName, width = 2, pad = "0")
    )
  )

# Distinct (cluster label, colour) pairs, sorted by label.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>% 
  distinct(Chr11DupHapClusterName, colors) %>% 
  arrange(Chr11DupHapClusterName)

# Named colour vector (label -> colour) for scale_fill_manual().
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors = setNames(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColorsDf$colors, 
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName
)

# Layer the annotations onto the closeToPerfectCopies plot. Each annotation
# column is a strip of per-sample tiles on its own fill scale, laid out right
# to left: country (-4.5..0), region (-9.5..-5), continent (-14.5..-10) and
# Chr11 duplication haplotype cluster (-19.5..-15).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot + 
  # haplotype tiles: only the popid values named in previousColors
  scale_fill_manual(
    "Microhaplotype\nRank", 
    values = haplotypeRankColors[sort(names(previousColors))], 
    labels = names(haplotypeRankColors[sort(names(previousColors))]), 
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) + 
  guides(fill = guide_legend(nrow = 3)) + 
  # country column
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = 0, 
      xmax = -4.5, 
      ymin = as.numeric(BiologicalSample) - 0.5, 
      ymax = as.numeric(BiologicalSample) + 0.5, 
      fill = country
    ), 
    color = "black", 
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies
  ) + 
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  # region column
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = -5, 
      xmax = -9.5, 
      ymin = as.numeric(BiologicalSample) - 0.5, 
      ymax = as.numeric(BiologicalSample) + 0.5, 
      fill = region
    ), 
    color = "black", 
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies
  ) + 
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  # continent column
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = -10, 
      xmax = -14.5, 
      ymin = as.numeric(BiologicalSample) - 0.5, 
      ymax = as.numeric(BiologicalSample) + 0.5, 
      fill = secondaryRegion
    ), 
    color = "black", 
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies
  ) + 
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  # haplotype-cluster column
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = -15, 
      xmax = -19.5, 
      ymin = as.numeric(BiologicalSample) - 0.5, 
      ymax = as.numeric(BiologicalSample) + 0.5, 
      fill = factor(Chr11DupHapClusterName)
    ), 
    color = "black", 
    data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies
  ) + 
  # Earlier variants of this layer/scale, kept for reference:
  #               fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+ 
  # scale_fill_manual("HaploGroup",  values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups))))  + 
  # scale_fill_manual("Chr11DupHapCluster",  values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors))  + 
  scale_fill_manual(
    "Chr11DupHapCluster", 
    values = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors, 
    labels = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors), 
    breaks = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors)
  ) + 
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank: colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just one bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample (which in this instance would mean the copies on chr11 and chr13 are not perfect copies).

Code

# Region annotation rows for the gene track. genomicID takes its levels from
# the p_name factor of the data actually plotted here (the closeToPerfectCopies
# set, which also defines the plot's x breaks), so as.numeric(genomicID) lines
# up with the x axis. The levels were previously taken from the unfiltered
# `_prep` object, which misplaces the track whenever its p_name levels differ
# from those of the plotted subset.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name)))

# Add the gene-description track (y from -5 to 0) on its own fill scale and
# apply the large-font legend/axis theme. new_scale_fill() is
# namespace-qualified like the other uses in this file, so the chunk does not
# depend on ggnewscale being attached.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot + 
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(
      xmin = as.numeric(genomicID) - 0.5, 
      xmax = as.numeric(genomicID) + 0.5, 
      ymax = 0, 
      ymin = -5, 
      fill = description
    ), 
    data = regions_afterHomologous_chr11_filt, 
    color = "black"
  ) + 
  scale_fill_manual(
    "Genes\nDescription", 
    values = descriptionColors, 
    guide = guide_legend(nrow = 5)
  ) + 
  transparentBackground + 
  theme(
    legend.text = element_text(size = 30), 
    legend.title = element_text(size = 30, face = "bold"), 
    legend.box = "vertical", 
    legend.margin = margin(), 
    legend.background = element_blank(), 
    legend.box.background = element_rect(colour = "black"), 
    axis.text.x = element_text(size = 30)
  )


print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot)

Code

# Export the closeToPerfectCopies plot, titled "Perfect Copies", as a
# 40 x 35 inch PDF. FALSE is spelled out because F is a reassignable variable.
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_plot.pdf", useDingbats = FALSE, width = 40, height = 35)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot + labs(title = "Perfect Copies"))
dev.off()

quartz_off_screen 
                2

Divergent copies

Code

# Rainbow-plot input for every sample NOT listed in the closeToPerfectCopies
# table (minPopSize = 1).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>% 
  filter(s_Sample %!in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample) %>% 
  HaplotypeRainbows::prepForRainbow(minPopSize = 1)


# Sample x haplotype presence table: keep rows whose samp_n is above 90% of the
# maximum samp_n, mark each remaining (sample, haplotype) row with 1 and spread
# to wide form (absent = 0).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>% 
  ungroup() %>% 
  filter(samp_n > 0.9 * max(samp_n)) %>% 
  group_by(s_Sample, p_name) %>% 
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  ungroup() %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix without the leading s_Sample column, rows named by sample,
# clustered with the default dist()/hclust() settings.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp[, -1])
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat %>% 
  dist()
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist_hclust = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist %>% 
  hclust()

# Sample order taken from the "forClustering" dendrogram, restricted to the
# samples present in the divergent-copies matrix, so this plot follows the same
# ordering as that clustering.
nameOrderFromforClustering = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust$order]
orderForDivergentCopy = nameOrderFromforClustering[nameOrderFromforClustering %in% rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat)]



# Relevel the samples to that order and recode popid to -1 wherever
# maxPopid == 1.
# NOTE(review): a sample missing from orderForDivergentCopy becomes NA in
# s_Sample here - confirm every divergent sample is in the forClustering matrix.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>% 
  mutate(s_Sample = factor(s_Sample, 
                           # levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist_hclust$order]))%>% 
                           levels = orderForDivergentCopy)) %>% 
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Base rainbow plot with one x break per p_name level; seq_along() keeps breaks
# and labels the same length even for an empty level set, where 1:length()
# would yield c(1, 0).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$p_name)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$p_name), 
    expand = c(0, 0)
  )







# Metadata for the samples shown in the divergent-copy plot, with
# BiologicalSample re-levelled to match the plot's sample order.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = meta_preferredSample %>% 
  filter(
    BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample
  ) %>% 
  mutate(
    BiologicalSample = factor(
      BiologicalSample,
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample)
    )
  )

# Concatenate every annotation palette in rowAnnoColors into one vector.
allColors = Reduce(
  function(collected, annoName) c(collected, rowAnnoColors[[annoName]]),
  names(rowAnnoColors),
  c()
)

# Fill colours used by the first layer of the base plot, named by the sorted
# popid values; popid -1 is forced to "grey0".
# NOTE(review): assumes the fill colours appear in sorted-popid order - confirm.
previousColors = setNames(
  unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_plot)$data[[1]][["fill"]]),
  sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$popid))
)
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Same table with popid turned into a factor so it maps to a discrete fill scale.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>% 
  mutate(
    s_Sample = factor(s_Sample, levels = orderForDivergentCopy),
    popid = factor(popid)
  )

# Cluster assignments for the plotted samples; clusters of size 1 are labelled
# "singlet", the others get their cluster name left-padded with "0" to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = 
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(
    BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample
  ) %>% 
  mutate(
    BiologicalSample = factor(
      as.character(BiologicalSample),
      levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample)
    ),
    Chr11DupHapClusterName = ifelse(
      hcclustSize == 1,
      "singlet",
      stringr::str_pad(newClusterName, width = 2, pad = "0")
    )
  ) %>% 
  arrange(BiologicalSample)

# One row per (cluster name, colour) pair, sorted by cluster name.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>% 
  select(Chr11DupHapClusterName, colors) %>% 
  unique() %>% 
  arrange(Chr11DupHapClusterName)

# Named colour vector (names = cluster names) for scale_fill_manual().
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors = setNames(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColorsDf$colors,
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName
)


# Base rainbow plot for the divergent-copy samples.
# x axis: four labelled breaks at negative positions (the midpoints of the
# annotation strips drawn later), followed by one unlabelled break per p_name level.
# y axis: one break per sample, labelled with the sample name.
# seq_along() is used instead of 1:length() so that empty level sets give an
# empty break vector rather than c(1, 0).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = c(-19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$p_name))), 
                     labels = c("Chr11DupHapCluster", "continent", "region", "country", 
                                rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$p_name)))
                                # levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$p_name)
                                ), 
                     expand = c(0,0))+ 
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$s_Sample)
  )


# Add the haplotype-rank fill scale and four per-sample annotation strips to the
# left of the haplotype columns. Each strip is drawn after a fresh
# ggnewscale::new_scale_fill() so it gets its own fill scale and legend.
# Strip x ranges: country 0 to -4.5, region -5 to -9.5, secondaryRegion
# (legend titled "Continent") -10 to -14.5, Chr11DupHapCluster -15 to -19.5.
# Each rectangle is centred on the sample's factor index on the y axis.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot+ 
  # haplotype-rank colours, restricted to the ranks named in previousColors
  scale_fill_manual("Microhaplotype\nRank",   values = haplotypeRankColors[sort(names(previousColors))], labels = names(haplotypeRankColors[sort(names(previousColors))]), breaks = names(haplotypeRankColors[sort(names(previousColors))]))  + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # country strip
  geom_rect(aes(xmin= 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = country), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies) + 
  scale_fill_manual("country",  values = rowAnnoColors[["country"]])  + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill()  + 
  
  # region strip
  geom_rect(aes(xmin= -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = region), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies) + 
  scale_fill_manual("region",  values = rowAnnoColors[["region"]])  + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill()  + 
  # continent strip (filled by secondaryRegion, coloured with the "continent" palette)
  geom_rect(aes(xmin= -10, xmax = -14.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = secondaryRegion), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies)+ 
  scale_fill_manual("Continent",  values = rowAnnoColors[["continent"]])  + 
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill()  + 
  # chr11 duplicated-haplotype cluster strip, using the cluster table subset to the plotted samples
  geom_rect(aes(xmin= -15, xmax = -19.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = factor(Chr11DupHapClusterName)), color = "black", data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies)+
                # fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+ 
  # scale_fill_manual("HaploGroup",  values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups))))  + 
  #scale_fill_manual("Chr11DupHapCluster",  values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors))  + 
  scale_fill_manual("Chr11DupHapCluster",  values = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors, labels = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors),
                    breaks = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors))  + 
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-regions within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank, with rank ordered by frequency within the total population. Within a vertical column the same color is the same haplotype, but the same color in different columns does not mean the same haplotype. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar at a genomic position is monoclonal at that position, while multiple bars would indicate polyclonal (which in this instance would mean the copies on chr11 and chr13 are not perfect copies).

Code

# Keep only the annotated regions that appear as targets (p_name) in the
# prepared haplotype table, and give genomicID the same level order as p_name
# so that as.numeric(genomicID) lines up with the plot's x positions.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)))

# Add a gene-description strip below the samples (y from -1 to 0), one
# rectangle per region, on its own fill scale. new_scale_fill() is called with
# the ggnewscale:: prefix, as everywhere else in this file, so the chunk does
# not depend on ggnewscale being attached.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot + 
        ggnewscale::new_scale_fill() + 
        geom_rect(aes(xmin = as.numeric(genomicID) - 0.5, 
                      xmax = as.numeric(genomicID) + 0.5, 
                      ymax = 0, 
                      ymin = -1, 
                      fill = description), 
                  data = regions_afterHomologous_chr11_filt, color  = "black") + 
        scale_fill_manual("Genes\nDescription", values = descriptionColors, 
                          guide = guide_legend(nrow = 5)) + 
  # large legend/axis text because the figure is exported at 40 x 30 inches
  transparentBackground + theme(legend.text = element_text(size = 30), 
        legend.title = element_text(size = 30, face = "bold"), 
        legend.box="vertical", legend.margin=margin(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"), 
        axis.text.x = element_text(size = 30))


print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot)

Code

# Write the annotated divergent-copy plot, titled "Divergent Copies", to a
# 40 x 30 inch PDF.
pdf(
  file = "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_plot.pdf",
  width = 40,
  height = 30,
  useDingbats = FALSE
)
print(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot +
    labs(title = "Divergent Copies")
)
dev.off()

quartz_off_screen 
                2

Subset

SD01, HB3, Santa-Lucia-Salvador-I

Code

# Prepare the rainbow-plot table for the three lab isolates only.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>% 
    filter(s_Sample %in% c("HB3", "SD01", "Santa-Lucia-Salvador-I")),
  minPopSize = 1
)

# Sample-by-haplotype presence/absence table. Targets are kept only when they
# are seen in more than 90% of the maximum per-target sample count; every
# remaining haplotype row is marked as present.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates %>% 
  group_by(p_name) %>% 
  mutate(sampleCount = n_distinct(s_Sample)) %>% 
  ungroup() %>% 
  filter(sampleCount > 0.9 * max(sampleCount)) %>% 
  mutate(marker = 1) %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix (first column s_Sample dropped, sample names as row names),
# its distance matrix, and the hierarchical clustering of the samples.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp[, 2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp)]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist = dist(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat
)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist_hclust = hclust(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist
)

# Re-level s_Sample so the samples are drawn in hierarchical-clustering order.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates %>% 
  mutate(s_Sample = factor(s_Sample, 
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist_hclust$order]))
# First pass with popid as-is; this plot is only used below to read the fill
# colours and is then overwritten. seq_along() replaces 1:length() so that an
# empty level set gives no breaks instead of c(1, 0).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name)), 
                     # labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name), 
                     labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name))), 
                     expand = c(0,0))


# Fill colours of the first layer, named by the sorted popid values; "-1" is set to "grey0".
# NOTE(review): assumes the fill colours appear in sorted-popid order - confirm.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$popid))
previousColors["-1"] = "grey0";

# Final plot: same as above but with popid as a factor, so a manual discrete
# fill scale can be attached later. The x axis has one unlabelled break per target.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates%>% 
  mutate(popid= factor(popid)), colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name)), 
                     # labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name), 
                     labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name))), 
                     expand = c(0,0))

The y-axis is samples and the x-axis is sub-regions within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank, with rank ordered by frequency within the total population. Within a vertical column the same color is the same haplotype, but the same color in different columns does not mean the same haplotype. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar at a genomic position is monoclonal at that position, while multiple bars would indicate polyclonal (which in this instance would mean the copies on chr11 and chr13 are not perfect copies).

It appears that SD01 and Santa-Lucia-Salvador-I have perfect copies of chr11 on chr11 and chr13 while HB3 has a divergent copy (which is confirmed with the nanopore assembly)

Interestingly, the Santa-Lucia-Salvador-I chr11 duplicated region appears to match one of the chr11 copies in HB3.

Code

# Finish the lab-isolate plot: attach the haplotype-rank fill scale (limited to
# the ranks named in previousColors), then start a new fill scale and draw the
# gene-description strip below the samples (y from -1 to 0, one rectangle per
# region in regions_afterHomologous_chr11_filt), and apply the large-text theme.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot + 
  scale_fill_manual("Microhaplotype\nRank",   values = haplotypeRankColors[sort(names(previousColors))], labels = names(haplotypeRankColors[sort(names(previousColors))]), breaks = names(haplotypeRankColors[sort(names(previousColors))]))  +
  guides(fill = guide_legend(nrow = 1)) +
  # everything after this point uses a separate fill scale and legend
  ggnewscale::new_scale_fill() + 
        geom_rect(aes(xmin = as.numeric(genomicID) - 0.5, 
                      xmax = as.numeric(genomicID) + 0.5, 
                      ymax = 0, 
                      ymin = -1, 
                      fill = description), 
                  data = regions_afterHomologous_chr11_filt, color  = "black") + 
        scale_fill_manual("Genes\nDescription", values = descriptionColors, 
                          guide = guide_legend(nrow = 5)) + 
  transparentBackground + theme(legend.text = element_text(size = 30), 
        legend.title = element_text(size = 30, face = "bold"), 
        legend.box="vertical", legend.margin=margin(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"), 
        axis.text.x = element_text(size = 30), 
        axis.text.y = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot)

Code

# Write the lab-isolate plot to a 40 x 7.5 inch PDF.
pdf(
  file = "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot.pdf",
  width = 40,
  height = 7.5,
  useDingbats = FALSE
)
print(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot
)
dev.off()

quartz_off_screen 
                2

Shared Region between chr11 and chr13

The data on the 15.2kb duplicated region between chromosome 11 and 13.

Code

# Region names that are removed from the shared-region analysis below.
excludeRegions = c(
  "Pf3D7_11_v3-1919633-1920323-for__var-6",
  "Pf3D7_11_v3-1920483-1921173-for__var-3",
  "Pf3D7_11_v3-1920483-1921173-for__var-4",
  "Pf3D7_11_v3-1920483-1921173-for__var-5",
  "Pf3D7_11_v3-1920483-1921173-for__var-6",
  "Pf3D7_11_v3-1920483-1921173-for__var-7",
  "Pf3D7_11_v3-1928369-1928869-for__var-3",
  "Pf3D7_11_v3-1928619-1929119-for__var-3"
)

# Regions flagged as "shared" that sit on chromosome 11, minus the excluded names.
regions_homologousRegion = regions %>% 
  filter(
    homologousRegion == "shared",
    `#chrom` == "Pf3D7_11_v3",
    name %!in% excludeRegions
  )


# Haplotype rows that fall in the shared region, with uniqHaps = number of rows
# for each (sample, target) pair. The result stays grouped by s_Sample and p_name.
popClustering_filt_hrp3_pat1_regions_homologousRegion = popClustering_filt_hrp3_pat1 %>% 
  filter(p_name %in% regions_homologousRegion$genomicID) %>% 
  group_by(s_Sample, p_name) %>% 
  mutate(uniqHaps = n())

# Per sample: how many rows carry each uniqHaps value, relative to the number
# of distinct targets in that sample.
# NOTE(review): n counts rows, so a target with k rows contributes k to its uniqHaps bucket - confirm intended.
popClustering_filt_hrp3_pat1_regions_homologousRegion_uniqSum = popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
  group_by(s_Sample) %>% 
  mutate(targets = n_distinct(genomicID)) %>% 
  group_by(s_Sample, targets, uniqHaps) %>% 
  count() %>% 
  mutate(freq = n / targets)

# Per sample: number of targets with a single haplotype row ("conserved"),
# number of distinct targets, and their ratio.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum = popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
  group_by(s_Sample) %>% 
  summarise(
    conserved = sum(uniqHaps == 1),
    targets = n_distinct(genomicID)
  ) %>% 
  mutate(conservedID = conserved / targets)

# A sample counts as a (near-)perfect copy when its conserved fraction is above this value.
conservedCutOff = 0.99

# Samples above the cut-off.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>%
  filter(conservedID > conservedCutOff)

# Overall count and frequency of samples above the cut-off.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOff = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>% 
  mutate(marker = conservedID > conservedCutOff) %>% 
  ungroup() %>% 
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample),
    perfectCopyFreq = perfectDuplication / totalSamps
  )

# Same summary per region; metadata is joined on the shared column names
# after renaming its sample column to s_Sample.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByRegion = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>% 
  mutate(marker = conservedID > conservedCutOff) %>% 
  left_join(rename(metaByBioSample, s_Sample = sample)) %>% 
  group_by(region) %>% 
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample),
    perfectCopyFreq = perfectDuplication / totalSamps
  )

# Same summary per secondaryRegion (continent).
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByContinent = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>% 
  mutate(marker = conservedID > conservedCutOff) %>% 
  left_join(rename(metaByBioSample, s_Sample = sample)) %>% 
  group_by(secondaryRegion) %>% 
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = n_distinct(s_Sample),
    perfectCopyFreq = perfectDuplication / totalSamps
  )

The number of samples with perfect copies

Code

# Show the overall summary: samples above the cut-off, total samples, and their ratio.
# create_dt() is defined elsewhere; presumably it renders an interactive table - confirm.
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOff)

Show entries

Search:

	perfectDuplication	totalSamps	perfectCopyFreq
	0 1	0 1	0.241666666666666 0.241666666666667
1	29	120	0.2416666666666667

Showing 1 to 1 of 1 entries

Previous1Next

The number of samples with perfect copies broken down by regions

Code

# Show the same summary with one row per region.
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByRegion)

Show entries

Search:

	region	perfectDuplication	totalSamps	perfectCopyFreq
		0 12	2 42	0.000000000000000 0.500000000000000
1	East Africa	11	23	0.4782608695652174
2	South America - Central	1	33	0.0303030303030303
3	South America - North	12	42	0.2857142857142857
4	South East Asia - East	2	4	0.5
5	South East Asia - West	0	2	0
6	West Africa	3	16	0.1875

Showing 1 to 6 of 6 entries

Previous1Next

The number of samples with perfect copies broken down by continent.

Code

# Show the same summary with one row per secondaryRegion (continent).
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByContinent)

Show entries

Search:

	secondaryRegion	perfectDuplication	totalSamps	perfectCopyFreq
		2 14	6 75	0.173333333333333 0.358974358974359
1	AFRICA	14	39	0.358974358974359
2	ASIA	2	6	0.3333333333333333
3	S_AMERICA	13	75	0.1733333333333333

Showing 1 to 3 of 3 entries

Previous1Next

Code

# Mean, minimum and standard deviation of the conserved fraction among the
# samples that are NOT above the perfect-copy cut-off.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_meanId = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>% 
  filter(!(conservedID > conservedCutOff)) %>% 
  summarise(
    meanID = mean(conservedID),
    minID = min(conservedID),
    sdID = sd(conservedID)
  )

The breakdown of the level of divergence in the samples with divergent copies.

Code

# Show the mean, minimum and standard deviation of the conserved fraction for the samples at or below the cut-off.
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_meanId)

Show entries

Search:

	meanID	minID	sdID
	0.927735134071869 0.927735134071870	0.792079207920792 0.792079207920793	0.049818411326546 0.049818411326547
1	0.9277351340718696	0.7920792079207921	0.04981841132654685

Showing 1 to 1 of 1 entries

Previous1Next

Population analysis of chr11/chr13 shared region

Calculating the population of the haplotypes of the shared region on chr 11/chr13

Code

# All samples' haplotype rows for the shared-region targets, without the
# regions listed in erroneousRegions.
popClustering_filt_regions_homologousRegion = popClustering %>% 
  filter(
    genomicID %!in% erroneousRegions,
    p_name %in% regions_homologousRegion$genomicID
  )

# Number of distinct targets observed per sample.
popClustering_filt_regions_homologousRegion_tarCounts = popClustering_filt_regions_homologousRegion %>% 
  group_by(s_Sample) %>% 
  summarise(tarCounts = n_distinct(p_name))

# Keep samples with at least 80% of the maximum target count, plus any sample
# listed in previousDeletionCalls regardless of its coverage.
popClustering_filt_regions_homologousRegion_tarCounts_filt = popClustering_filt_regions_homologousRegion_tarCounts %>% 
  filter(
    tarCounts >= 0.80 * max(tarCounts) | 
      s_Sample %in% previousDeletionCalls$BiologicalSample
  )

# Number of distinct samples observed per target.
popClustering_filt_regions_homologousRegion_sampCounts = popClustering_filt_regions_homologousRegion %>% 
  group_by(p_name) %>% 
  summarise(sampCounts = n_distinct(s_Sample))

# Export the retained samples' rows (sample, target, haplotype id, relative
# abundance) for the pairwise comparison step.
write_tsv(
  popClustering_filt_regions_homologousRegion %>%
    filter(s_Sample %in% popClustering_filt_regions_homologousRegion_tarCounts_filt$s_Sample) %>% 
    ungroup() %>% 
    select(s_Sample, p_name, h_popUID, c_AveragedFrac),
  "popClustering_filt_regions_homologousRegion.tab.txt.gz"
)

Code

# Pairwise haplotype-sharing comparison on the exported shared-region table;
# output goes to pairwiseComps_regions_homologousRegion (overwritten if present).
elucidator doPairwiseComparisonOnHapsSharingDev \
  --tableFnp popClustering_filt_regions_homologousRegion.tab.txt.gz \
  --sampleCol s_Sample \
  --targetNameCol p_name \
  --popIDCol h_popUID \
  --relAbundCol c_AveragedFrac \
  --numThreads 14 \
  --dout pairwiseComps_regions_homologousRegion \
  --verbose \
  --overWriteDir \
  --metaFnp metaByBioSample_outwithHrpCalls.tab.txt \
  --metaFieldsToCalcPopDiffs country,region,secondaryRegion,HRP3_deletionPattern \
  --writeOutDistMatrices

Code

# Read the header-less pairwise matrix and the matching sample-name list, then
# label the columns with the sample names and add them as a `sample` column.
jacardDist_homologousRegion = readr::read_tsv(
  "pairwiseComps_regions_homologousRegion/jacardByHapsTarShared.tab.txt.gz",
  col_names = FALSE
)
jacardDist_homologousRegionSamps = readr::read_tsv(
  "pairwiseComps_regions_homologousRegion/sampleNames.tab.txt",
  col_names = "samples"
)
colnames(jacardDist_homologousRegion) = jacardDist_homologousRegionSamps$samples
jacardDist_homologousRegion$sample = jacardDist_homologousRegionSamps$samples

# Restrict rows and columns to the samples in allDeletionTypeMeta_hrp3_pat1,
# keeping `sample` as the last column.
jacardDist_homologousRegion_filt = jacardDist_homologousRegion[
  jacardDist_homologousRegion$sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample,
  c(allDeletionTypeMeta_hrp3_pat1$BiologicalSample, "sample")
]

# Long format: one row per (sample, otherSample) pair with its index value.
jacardDist_homologousRegion_gat = jacardDist_homologousRegion_filt %>% 
  gather(otherSample, index, 1:(ncol(.) - 1))

jacardDist_homologousRegion_gat_filt = jacardDist_homologousRegion_gat %>% 
  filter(
    sample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample &
      otherSample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample
  )

# Back to wide format, then to a numeric matrix with sample names as row names.
jacardDist_homologousRegion_gat_filt_sp = jacardDist_homologousRegion_gat_filt %>% 
  spread(otherSample, index)

jacardDist_homologousRegion_gat_filt_sp_mat = as.matrix(
  jacardDist_homologousRegion_gat_filt_sp[, 2:ncol(jacardDist_homologousRegion_gat_filt_sp)]
)
rownames(jacardDist_homologousRegion_gat_filt_sp_mat) = jacardDist_homologousRegion_gat_filt_sp$sample

Code

library(circlize)
# Three-point colour ramp: blue at the smallest value in the matrix, white
# halfway between that value and 1, red at 1.
col_fun = local({
  lowestIndex = min(jacardDist_homologousRegion_gat_filt_sp_mat)
  colorRamp2(
    c(lowestIndex, lowestIndex + (1 - lowestIndex) / 2, 1),
    c("#2166ac", "white", "#b2182b")
  )
})

meta_preferredSample = filter(meta, PreferredSample)

# Metadata rows in the same order as the matrix rows, with a perfect-copy flag
# and the chr11 cluster name ("singlet" for clusters of size 1, otherwise the
# name left-padded with "0" to width 2).
metaSelected_hrp3_pat1 = meta_preferredSample[match(rownames(jacardDist_homologousRegion_gat_filt_sp_mat), meta_preferredSample$BiologicalSample), ] %>% 
  mutate(
    PerfectChr11Copy = BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample
  ) %>% 
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
      ungroup() %>% 
      mutate(
        newClusterName = ifelse(
          hcclustSize == 1,
          "singlet",
          as.character(stringr::str_pad(newClusterName, width = 2, pad = "0"))
        ),
        BiologicalSample = as.character(BiologicalSample)
      ) %>% 
      select(BiologicalSample, newClusterName)
  )

# Row/column labels: the sample name for lab isolates, blank for every other
# sample (including those with a missing site).
RowLabs = replace(
  metaSelected_hrp3_pat1$BiologicalSample,
  metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site),
  ""
)
ColLabs = replace(
  metaSelected_hrp3_pat1$BiologicalSample,
  metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site),
  ""
)

# Copy of the matrix whose dimnames are replaced by those labels.
jacardDist_homologousRegion_gat_filt_sp_mat_noLabs = jacardDist_homologousRegion_gat_filt_sp_mat
dimnames(jacardDist_homologousRegion_gat_filt_sp_mat_noLabs) = list(RowLabs, ColLabs)






# Annotation table for the heatmap: one column per annotation track, with
# secondaryRegion shown as "continent" and newClusterName as "Chr11DupHapCluster".
rowAnnoDf = metaSelected_hrp3_pat1 %>% 
  select(
    hrpCall,
    PerfectChr11Copy,
    country,
    region,
    continent = secondaryRegion,
    Chr11DupHapCluster = newClusterName
  ) %>% 
  as.data.frame()

annotationTextSize = 25
annotationTitleTextSize = 20

# Annotation palettes plus the cluster palette (which includes "singlet").
rowAnnoColors_mod = rowAnnoColors
rowAnnoColors_mod$Chr11DupHapCluster = newHaploGroupWithSingletColors

# Column annotation; its legends are hidden because the row annotation below
# shows the same tracks.
topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  gp = gpar(col = "grey10")
)

# Row annotation with the same tracks and colours, legends shown.
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  gp = gpar(col = "grey10")
)

# Heatmap of the pairwise index matrix with clustered rows and columns, the
# metadata annotations on top and on the left, and 5 cm dendrograms.
# The legend title belongs in heatmap_legend_param itself; it was previously
# passed to gpar(), where it is not a graphical parameter.
haptype_hrp3_regions_homologousRegion_pat1HeatMap = Heatmap(
  jacardDist_homologousRegion_gat_filt_sp_mat_noLabs,
  cluster_columns = TRUE,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  heatmap_legend_param = list(
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Code

# Render the heatmap on a transparent background with all legends merged and
# placed at the bottom.
draw(
  haptype_hrp3_regions_homologousRegion_pat1HeatMap,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)

Code

# Write the same heatmap rendering to a 25 x 20 inch PDF.
pdf(
  file = "haptype_hrp3_regions_homologousRegion_pat1HeatMap.pdf",
  width = 25,
  height = 20,
  useDingbats = FALSE
)
draw(
  haptype_hrp3_regions_homologousRegion_pat1HeatMap,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

Plotting haplotypes

All

Code

# Derive a description per region from extraField0: "intergenic" when the field
# contains "extraField0=NA", otherwise the text after "description=" with any
# "]" characters removed.
regions_homologousRegion = regions_homologousRegion %>% 
  mutate(
    description = case_when(
      grepl("extraField0=NA", extraField0) ~ "intergenic",
      TRUE ~ gsub("\\]", "", gsub(".*description=", "", extraField0))
    )
  )

# One colour per distinct description; "intergenic" is overridden with
# "#FF000000" (alpha 00, i.e. fully transparent).
descriptionColors_homologousRegion = setNames(
  scheme$hex(length(unique(regions_homologousRegion$description))),
  unique(regions_homologousRegion$description)
)
descriptionColors_homologousRegion["intergenic"] = "#FF000000"

popClustering_filt_hrp3_pat1_regions_homologousRegion_prep = HaplotypeRainbows::prepForRainbow(popClustering_filt_hrp3_pat1_regions_homologousRegion, minPopSize = 1)

# select just the major haplotypes and cluster based on the sharing between
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep %>% 
  group_by(p_name) %>% 
  mutate(sampleCount = length(unique(s_Sample)))%>% 
  group_by() %>% 
  filter(sampleCount > 0.9*max(sampleCount)) %>% 
  group_by(s_Sample, p_name) %>% 
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  group_by() %>% 
  select(h_popUID, marker, s_Sample) %>%   
  spread(h_popUID, marker, fill = 0)

popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist)

# Put the sample factor levels in hierarchical-clustering leaf order, and
# recode popid to -1 wherever maxPopid == 1.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep %>% 
  mutate(
    s_Sample = factor(
      s_Sample, 
      levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust$order]
    ), 
    popid = ifelse(maxPopid == 1, -1, popid)
  )

# Plain rainbow plot: one x break per window level and one y break per sample
# level. seq_along() keeps breaks and labels the same length even when a
# factor has no levels (1:length() would yield c(1, 0)).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$p_name)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$p_name), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample)
  )

# Metadata for the plotted samples, with BiologicalSample sharing the plot's
# sample level order so as.numeric() gives the matching row position.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample)))

# Concatenate every palette stored in rowAnnoColors into one vector.
allColors = unlist(lapply(names(rowAnnoColors), function(paletteName) rowAnnoColors[[paletteName]]))

# Distinct fill values of the first layer of the plain plot, named by the
# sorted popid values; "-1" is set to "grey0".
# NOTE(review): names are assigned positionally (unique() keeps row order,
# the popids are sorted), so confirm the name/colour pairing before relying
# on the values rather than just the names.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Copy of the plotting table with samples in clustering order and popid as a
# factor, so the fill scale is discrete.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep %>% 
  mutate(
    s_Sample = factor(
      s_Sample, 
      levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust$order]
    ), 
    popid = factor(popid)
  )

# Rainbow plot whose x axis also labels four strips at negative x (each break
# sits 2.25 in from the strip's outer edge); window breaks get blank labels.
# seq_along() replaces 1:length() so an empty level set cannot produce the
# bogus breaks c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, 
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name))
    ), 
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country", 
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name)))
    ), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample)
  )


# Cluster assignments for the plotted samples only, in the plot's sample
# order. Clusters of size 1 are named "singlet"; other names are left-padded
# with "0" to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry = 
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample) %>% 
  mutate(
    BiologicalSample = factor(
      as.character(BiologicalSample), 
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample)
    ), 
    Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))
  ) %>% 
  arrange(BiologicalSample)

# Distinct (cluster name, colour) pairs sorted by name ...
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry %>% 
  distinct(Chr11DupHapClusterName, colors) %>% 
  arrange(Chr11DupHapClusterName)

# ... flattened into a colour vector named by cluster for scale_fill_manual().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors = setNames(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColorsDf$colors, 
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName
)



# Rank colours for the ranks named in previousColors, ordered numerically.
# A plain sort() on the character names is lexicographic ("10" before "2"),
# which scrambles the rank legend once there are ten or more ranks.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_rankColors = haplotypeRankColors[
  names(previousColors)[order(as.numeric(names(previousColors)))]
]

# Add the rank fill scale, then four annotation strips left of the windows
# (x < 0), one rectangle per sample row. Each strip gets its own fill scale
# via ggnewscale::new_scale_fill().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot + 
  scale_fill_manual(
    "Microhaplotype\nRank", 
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_rankColors, 
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_rankColors), 
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_rankColors)
  ) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 1 (x from -4.5 to 0): country
  geom_rect(aes(xmin = 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = country), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep) + 
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 2 (x from -9.5 to -5): region
  geom_rect(aes(xmin = -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = region), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep) + 
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 3 (x from -14.5 to -10): secondaryRegion, shown under the "Continent" legend
  geom_rect(aes(xmin = -10, xmax = -14.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = secondaryRegion), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep) + 
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 4 (x from -19.5 to -15): chromosome 11 duplication haplotype cluster
  geom_rect(aes(xmin = -15, xmax = -19.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = factor(Chr11DupHapClusterName)), color = "black", data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry) + 
  scale_fill_manual(
    "Chr11DupHapCluster", 
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors, 
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors),
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors)
  ) + 
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank: within a vertical column the same color is the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar for a genomic position is monoclonal at this position, while multiple bars would indicate a polyclonal sample.

Code

# Region annotations for the plotted windows only, with genomicID sharing
# the plot's window level order so as.numeric() gives the x position.
regions_homologousRegion_filt = regions_homologousRegion %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name)))

# Add a gene-description track below the samples (y from -10 to 0) and apply
# the final legend/axis styling. new_scale_fill() is namespace-qualified to
# match the other ggnewscale calls in this file.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot + 
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(xmin = as.numeric(genomicID) - 0.5, 
        xmax = as.numeric(genomicID) + 0.5, 
        ymax = 0, 
        ymin = -10, 
        fill = description), 
    data = regions_homologousRegion_filt, color = "black"
  ) + 
  scale_fill_manual("Genes\nDescription", values = descriptionColors_homologousRegion, 
                    guide = guide_legend(nrow = 2)) + 
  transparentBackground + 
  theme(
    legend.text = element_text(size = 30), 
    legend.title = element_text(size = 30, face = "bold"), 
    legend.box = "vertical", legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"), 
    axis.text.x = element_text(size = 30)
  )

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot_final)

Code

# Write the final annotated plot to a 30 x 25 inch PDF. FALSE is spelled out
# because the shorthand F can be reassigned.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_plot.pdf", useDingbats = FALSE, width = 30, height = 25)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot_final)
dev.off()

quartz_off_screen 
                2

Perfect copies

Code

# Plotting table restricted to the samples listed in
# ..._conservedSum_closeToPerfectCopies (minPopSize = 1).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
    filter(s_Sample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_closeToPerfectCopies$s_Sample), 
  minPopSize = 1
)

# Wide sample x haplotype table: 1 where a sample carries a haplotype, 0
# otherwise. Only windows (p_name) whose distinct-sample count exceeds 90% of
# the largest per-window count contribute.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies %>% 
  group_by(p_name) %>% 
  mutate(sampleCount = length(unique(s_Sample))) %>% 
  ungroup() %>% 
  filter(sampleCount > 0.9 * max(sampleCount)) %>% 
  # disabled: restrict to each sample's major haplotype per window
  # group_by(s_Sample, p_name) %>% filter(c_AveragedFrac == max(c_AveragedFrac)) %>% ungroup() %>% 
  mutate(marker = 1) %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix (first column, s_Sample, moved to the row names), then
# distances between samples and their hierarchical clustering.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp[, -1]
)
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist = stats::dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist_hclust = stats::hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist)

# Put the sample factor levels in hierarchical-clustering leaf order, and
# recode popid to -1 wherever maxPopid == 1.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies %>% 
  mutate(
    s_Sample = factor(
      s_Sample, 
      levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist_hclust$order]
    ), 
    popid = ifelse(maxPopid == 1, -1, popid)
  )

# Plain rainbow plot with one x break per window level. seq_along() keeps
# breaks and labels the same length even when the factor has no levels
# (1:length() would yield c(1, 0)).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$p_name)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$p_name), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(expand = c(0, 0))

# Metadata for the plotted samples, with BiologicalSample sharing the plot's
# sample level order so as.numeric() gives the matching row position.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$s_Sample) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$s_Sample)))

# Concatenate every palette stored in rowAnnoColors into one vector.
allColors = unlist(lapply(names(rowAnnoColors), function(paletteName) rowAnnoColors[[paletteName]]))

# Distinct fill values of the first layer of the plain plot, named by the
# sorted popid values; "-1" is set to "grey0".
# NOTE(review): names are assigned positionally (unique() keeps row order,
# the popids are sorted), so confirm the name/colour pairing before relying
# on the values rather than just the names.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Copy of the plotting table with samples in clustering order and popid as a
# factor, so the fill scale is discrete.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies %>% 
  mutate(
    s_Sample = factor(
      s_Sample, 
      levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist_hclust$order]
    ), 
    popid = factor(popid)
  )

# Rainbow plot whose x axis also labels four strips at negative x (each break
# sits 2.25 in from the strip's outer edge); window breaks get blank labels.
# seq_along() replaces 1:length() so an empty level set cannot produce the
# bogus breaks c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, 
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name))
    ), 
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country", 
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name)))
    ), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample)
  )


# Cluster assignments for the plotted samples only, in the plot's sample
# order. Clusters of size 1 are named "singlet"; other names are left-padded
# with "0" to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry = 
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample) %>% 
  mutate(
    BiologicalSample = factor(
      as.character(BiologicalSample), 
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample)
    ), 
    Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))
  ) %>% 
  arrange(BiologicalSample)

# Distinct (cluster name, colour) pairs sorted by name ...
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry %>% 
  distinct(Chr11DupHapClusterName, colors) %>% 
  arrange(Chr11DupHapClusterName)

# ... flattened into a colour vector named by cluster for scale_fill_manual().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors = setNames(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColorsDf$colors, 
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName
)



# Rank colours for the ranks named in previousColors, ordered numerically.
# A plain sort() on the character names is lexicographic ("10" before "2"),
# which scrambles the rank legend once there are ten or more ranks.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_rankColors = haplotypeRankColors[
  names(previousColors)[order(as.numeric(names(previousColors)))]
]

# Add the rank fill scale, then four annotation strips left of the windows
# (x < 0), one rectangle per sample row. Each strip gets its own fill scale
# via ggnewscale::new_scale_fill().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot + 
  scale_fill_manual(
    "Microhaplotype\nRank", 
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_rankColors, 
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_rankColors), 
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_rankColors)
  ) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 1 (x from -4.5 to 0): country
  geom_rect(aes(xmin = 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = country), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies) + 
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 2 (x from -9.5 to -5): region
  geom_rect(aes(xmin = -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = region), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies) + 
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 3 (x from -14.5 to -10): secondaryRegion, shown under the "Continent" legend
  geom_rect(aes(xmin = -10, xmax = -14.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = secondaryRegion), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies) + 
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  # strip 4 (x from -19.5 to -15): chromosome 11 duplication haplotype cluster
  geom_rect(aes(xmin = -15, xmax = -19.5,
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = factor(Chr11DupHapClusterName)), color = "black", data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry) + 
  scale_fill_manual(
    "Chr11DupHapCluster", 
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors, 
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors),
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors)
  ) + 
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank: within a vertical column the same color is the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar for a genomic position is monoclonal at this position, while multiple bars would indicate a polyclonal sample.

Code

# Region annotations for the plotted windows only, with genomicID sharing
# the plot's window level order so as.numeric() gives the x position.
regions_homologousRegion_filt = regions_homologousRegion %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name)))

# Add a gene-description track below the samples (y from -1 to 0) and apply
# the final legend/axis styling. new_scale_fill() is namespace-qualified to
# match the other ggnewscale calls in this file.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot + 
  ggnewscale::new_scale_fill() + 
  geom_rect(
    aes(xmin = as.numeric(genomicID) - 0.5, 
        xmax = as.numeric(genomicID) + 0.5, 
        ymax = 0, 
        ymin = -1, 
        fill = description), 
    data = regions_homologousRegion_filt, color = "black"
  ) + 
  scale_fill_manual("Genes\nDescription", values = descriptionColors_homologousRegion, 
                    guide = guide_legend(nrow = 2)) + 
  transparentBackground + 
  theme(
    legend.text = element_text(size = 30), 
    legend.title = element_text(size = 30, face = "bold"), 
    legend.box = "vertical", legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"), 
    axis.text.x = element_text(size = 30)
  )

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot_final)

Code

# Write the final annotated plot to a 30 x 25 inch PDF. FALSE is spelled out
# because the shorthand F can be reassigned.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_plot.pdf", useDingbats = FALSE, width = 30, height = 25)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot_final)
dev.off()

quartz_off_screen 
                2

Divergent copies

Divergent copies of the shared region

Code

# Plotting table restricted to the samples NOT listed in
# ..._conservedSum_closeToPerfectCopies (minPopSize = 1).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
    filter(s_Sample %!in% popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_closeToPerfectCopies$s_Sample), 
  minPopSize = 1
)

# Wide sample x haplotype table: 1 where a sample carries a haplotype, 0
# otherwise. Rows are kept when samp_n exceeds 90% of its maximum.
# NOTE(review): samp_n is not created in this chunk - presumably a column
# returned by prepForRainbow(); confirm it matches the per-window sampleCount
# used in the sibling sections.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies %>% 
  ungroup() %>% 
  filter(samp_n > 0.9 * max(samp_n)) %>% 
  # disabled: restrict to each sample's major haplotype per window
  # group_by(s_Sample, p_name) %>% filter(c_AveragedFrac == max(c_AveragedFrac)) %>% ungroup() %>% 
  mutate(marker = 1) %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix (first column, s_Sample, moved to the row names), then
# distances between samples and their hierarchical clustering.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp[, -1]
)
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist = stats::dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist_hclust = stats::hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist)


# Sample order from the clustering of ALL samples, restricted to the samples
# present in the divergent-copies matrix.
nameOrderFromAll = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust$order]
orderForDivergentCopy = nameOrderFromAll[nameOrderFromAll %in% rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat)]

# Level the samples by that shared order (the subset's own hclust order is
# deliberately not used here) and recode popid to -1 wherever maxPopid == 1.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies %>% 
  mutate(
    s_Sample = factor(s_Sample, levels = orderForDivergentCopy), 
    popid = ifelse(maxPopid == 1, -1, popid)
  )

# Plain rainbow plot with one x break per window level. seq_along() keeps
# breaks and labels the same length even when the factor has no levels
# (1:length() would yield c(1, 0)).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$p_name)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$p_name), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(expand = c(0, 0))

# Metadata for the plotted samples, with BiologicalSample sharing the plot's
# sample level order so as.numeric() gives the matching row position.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$s_Sample) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$s_Sample)))

# Concatenate every palette stored in rowAnnoColors into one vector.
allColors = unlist(lapply(names(rowAnnoColors), function(paletteName) rowAnnoColors[[paletteName]]))

# Distinct fill values of the first layer of the plain plot, named by the
# sorted popid values; "-1" is set to "grey0".
# NOTE(review): names are assigned positionally (unique() keeps row order,
# the popids are sorted), so confirm the name/colour pairing before relying
# on the values rather than just the names.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Copy of the plotting table with samples in the shared clustering order
# (orderForDivergentCopy) and popid as a factor, so the fill scale is discrete.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies %>% 
  mutate(
    s_Sample = factor(s_Sample, levels = orderForDivergentCopy), 
    popid = factor(popid)
  )

# Rainbow plot whose x axis also labels four strips at negative x (each break
# sits 2.25 in from the strip's outer edge); window breaks get blank labels.
# seq_along() replaces 1:length() so an empty level set cannot produce the
# bogus breaks c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, 
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name))
    ), 
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country", 
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name)))
    ), 
    expand = c(0, 0)
  ) + 
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample)
  )



# Cluster assignments for the plotted samples only, in the plot's sample
# order. Clusters of size 1 are named "singlet"; other names are left-padded
# with "0" to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry = 
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample) %>% 
  mutate(
    BiologicalSample = factor(
      as.character(BiologicalSample), 
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample)
    ), 
    Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))
  ) %>% 
  arrange(BiologicalSample)

# Distinct (cluster name, colour) pairs sorted by name ...
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry %>% 
  distinct(Chr11DupHapClusterName, colors) %>% 
  arrange(Chr11DupHapClusterName)

# ... flattened into a colour vector named by cluster for scale_fill_manual().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors = setNames(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColorsDf$colors, 
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName
)





# Add the fill scales and the per-sample annotation strips to the
# divergent-copies plot. Every strip is a column of rectangles left of x = 0,
# one rectangle per sample (y = integer code of the BiologicalSample factor
# +/- 0.5). ggnewscale::new_scale_fill() is inserted between strips so that each
# strip gets an independent fill scale and legend.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot +
  # Scale for the fill aesthetic already present in the plot; colours are the
  # haplotypeRankColors entries selected by the (sorted) names of previousColors.
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = haplotypeRankColors[sort(names(previousColors))],
    labels = names(haplotypeRankColors[sort(names(previousColors))]),
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 1 (x from 0 to -4.5): country.
  geom_rect(
    aes(
      xmin = 0, xmax = -4.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = country
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies
  ) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 2 (x from -5 to -9.5): region.
  geom_rect(
    aes(
      xmin = -5, xmax = -9.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = region
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies
  ) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 3 (x from -10 to -14.5): secondaryRegion, shown under the legend title
  # "Continent". NOTE(review): the palette is rowAnnoColors[["continent"]] --
  # confirm that its names match the secondaryRegion values.
  geom_rect(
    aes(
      xmin = -10, xmax = -14.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = secondaryRegion
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies
  ) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 4 (x from -15 to -19.5): chr11 duplication haplotype cluster.
  geom_rect(
    aes(
      xmin = -15, xmax = -19.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = factor(Chr11DupHapClusterName)
    ),
    color = "black",
    data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry
  ) +
  scale_fill_manual(
    "Chr11DupHapCluster",
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors,
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors),
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors)
  ) +
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-regions within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank. Colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just one bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

Code

# Region annotations restricted to the sub-regions present in the plot data.
# genomicID is re-levelled with the levels of p_name so that
# as.numeric(genomicID) gives the same integer code as the matching p_name level.
regions_homologousRegion_filt <- regions_homologousRegion %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name) %>%
  mutate(
    genomicID = factor(
      genomicID,
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name)
    )
  )

# Final figure: adds a band of rectangles below y = 0 (from -10 to 0), one per
# sub-region and filled by its description, then applies the legend/axis theme.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot_final <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot +
  ggnewscale::new_scale_fill() +
  geom_rect(
    mapping = aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymin = -10,
      ymax = 0,
      fill = description
    ),
    data = regions_homologousRegion_filt,
    color = "black"
  ) +
  scale_fill_manual(
    values = descriptionColors_homologousRegion,
    guide = guide_legend(nrow = 2)
  ) +
  labs(fill = "Genes\nDescription") +
  transparentBackground +
  theme(
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30, face = "bold"),
    legend.box = "vertical",
    legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    axis.text.x = element_text(size = 30)
  )

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot_final)

Code

# Write the final divergent-copies figure to a 30 x 30 PDF.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_plot.pdf", useDingbats = FALSE, width = 30, height = 30)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot_final)
dev.off()

quartz_off_screen 
                2

Perfect chr11 copies

The shared region of the strains with perfect chr11 copies.

Code

# Rainbow-plot input for the homologous region, limited to the samples listed in
# ..._afterHomologous_chr11_conservedSum_closeToPerfectCopies.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>%
  filter(s_Sample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample) %>%
  HaplotypeRainbows::prepForRainbow(minPopSize = 1)



# Sample x haplotype presence/absence table used to cluster samples by the
# haplotypes they share. Every haplotype row of a sample is marked 1 (the
# filters that would keep only well-covered loci or only the major haplotype are
# currently disabled); haplotypes a sample lacks are filled with 0.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies %>%
  group_by(s_Sample, p_name) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Numeric matrix of the haplotype columns (everything except s_Sample), with the
# sample names as row names.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat <- as.matrix(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp %>%
    select(-s_Sample)
)
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat) <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp$s_Sample

# Pairwise sample distances and hierarchical clustering, both with the default
# settings of dist() and hclust().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist <-
  dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust <-
  hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist)

# Position of every sample in the leaf order of the clustering above
# (1 = first leaf).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust_orderDf <- tibble(
  BiologicalSample = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat)[
    popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust$order
  ],
  byGenomicRegionHclustOrder = seq_along(BiologicalSample)
)

# Keep the chr11 cluster table sorted by cluster name (overwrites the object).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  arrange(newClusterName)

# Cluster rows for the plotted samples, joined with their dendrogram position
# and geographic metadata, then sorted. The row order of this table is what
# later defines the sample (y-axis) order of the plot.
# Both joins rely on dplyr's natural join over the shared column names.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$s_Sample) %>%
  left_join(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust_orderDf) %>%
  left_join(
    select(meta_preferredSample, BiologicalSample, country, subRegion, region, secondaryRegion)
  ) %>%
  arrange(Chr11DupHapCluster, subRegion, country, byGenomicRegionHclustOrder)



# Order the samples by the rows of ..._groups_df_select (cluster, subRegion,
# country, dendrogram position) instead of the raw hclust leaf order, and give
# rows whose maxPopid is 1 the sentinel popid -1 (coloured "grey0" further down).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select$BiologicalSample
    )
  ) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Base rainbow plot with one x tick per sub-region, labelled with the p_name
# levels. seq_along() is used instead of 1:length() so that an empty level set
# yields no breaks rather than c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_plot <-
  genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$p_name),
    expand = c(0, 0)
  ) +
  scale_y_continuous(expand = c(0, 0))







# Metadata rows for the plotted samples plus their chr11 cluster name.
# BiologicalSample is turned into a factor with the same levels as the plot's
# s_Sample so that as.numeric(BiologicalSample) matches the sample's row.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies <-
  meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$s_Sample) %>%
  left_join(
    select(
      popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select,
      BiologicalSample, newClusterName
    )
  ) %>%
  mutate(
    BiologicalSample = factor(
      BiologicalSample,
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$s_Sample)
    )
  )


# Concatenate every palette stored in rowAnnoColors into one vector, keeping the
# names carried by the individual palettes. Built in one step instead of growing
# the vector with c() inside a loop.
allColors <- unlist(lapply(names(rowAnnoColors), function(nm) rowAnnoColors[[nm]]))

# Distinct fill colours that ggplot assigned in the first layer of the base
# plot, named by the sorted distinct popid values.
# NOTE(review): this pairs colours (in order of first appearance in the layer
# data) with popids (sorted) purely by position and assumes both vectors have
# the same length -- confirm that this is what is intended.
previousColors <- unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_plot)$data[[1]][["fill"]])
names(previousColors) <- sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$popid))
# popid -1 marks rows with maxPopid == 1; colour them grey0.
previousColors["-1"] <- "grey0"
allColors <- c(allColors, previousColors)

# Plot data for the annotated figure: same sample ordering as before (rows of
# ..._groups_df_select), with popid converted to a factor so it is mapped to a
# discrete fill scale.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select$BiologicalSample
    )
  ) %>%
  mutate(popid = factor(popid))

# Rainbow plot with room for four annotation strips at negative x.
# x axis: one labelled tick at the centre of each 4.5-wide strip
# (e.g. -19.5 + 2.25 for the strip spanning -15 to -19.5) followed by one
# unlabelled tick per sub-region.
# y axis: one tick per sample, labelled with the sample name.
# seq_along() replaces 1:length() so empty level sets give no breaks instead of
# c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot <-
  genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name))
    ),
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country",
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name)))
    ),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample)
  )




# chr11 cluster assignment for the plotted samples. BiologicalSample becomes a
# factor with the plot's s_Sample levels, and clusters of size 1 are collapsed
# into the label "singlet"; other clusters are labelled with their name
# left-padded with "0" to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  ungroup() %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample) %>%
  mutate(
    BiologicalSample = factor(
      as.character(BiologicalSample),
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample)
    ),
    Chr11DupHapClusterName = ifelse(
      hcclustSize == 1,
      "singlet",
      stringr::str_pad(newClusterName, width = 2, pad = "0")
    )
  ) %>%
  arrange(BiologicalSample)

# Distinct (cluster label, colour) pairs, sorted by label.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColorsDf <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry %>%
  select(Chr11DupHapClusterName, colors) %>%
  unique() %>%
  arrange(Chr11DupHapClusterName)

# Named colour vector (names = cluster labels) for scale_fill_manual().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors <- setNames(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColorsDf$colors,
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName
)




# Add the fill scales and the four per-sample annotation strips to the
# perfect-chr11-copies plot. Each strip is a column of rectangles at negative x,
# one rectangle per sample (y = integer code of the BiologicalSample factor
# +/- 0.5); ggnewscale::new_scale_fill() separates the strips so each has its
# own fill scale and legend.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot +
  # Scale for the fill aesthetic already present in the plot; colours are the
  # haplotypeRankColors entries selected by the (sorted) names of previousColors.
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = haplotypeRankColors[sort(names(previousColors))],
    labels = names(haplotypeRankColors[sort(names(previousColors))]),
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 1 (x from 0 to -4.5): country.
  geom_rect(
    aes(
      xmin = 0, xmax = -4.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = country
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies
  ) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 2 (x from -5 to -9.5): region.
  geom_rect(
    aes(
      xmin = -5, xmax = -9.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = region
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies
  ) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 3 (x from -10 to -14.5): secondaryRegion, shown under the legend title
  # "Continent". NOTE(review): the palette is rowAnnoColors[["continent"]] --
  # confirm that its names match the secondaryRegion values.
  geom_rect(
    aes(
      xmin = -10, xmax = -14.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = secondaryRegion
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies
  ) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # Strip 4 (x from -15 to -19.5): chr11 duplication haplotype cluster.
  geom_rect(
    aes(
      xmin = -15, xmax = -19.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = factor(Chr11DupHapClusterName)
    ),
    color = "black",
    data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry
  ) +
  scale_fill_manual(
    "Chr11DupHapCluster",
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors,
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors),
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors)
  ) +
  guides(fill = guide_legend(nrow = 4))

The y-axis is samples and the x-axis is sub-regions within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank. Colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just one bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

Code

# Region annotations restricted to the sub-regions present in the plot data.
# genomicID is re-levelled with the levels of p_name so that
# as.numeric(genomicID) gives the same integer code as the matching p_name level.
regions_homologousRegion_filt <- regions_homologousRegion %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name) %>%
  mutate(
    genomicID = factor(
      genomicID,
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name)
    )
  )

# Final figure: adds a band of rectangles below y = 0 (from -10 to 0), one per
# sub-region and filled by its description, then applies the legend/axis theme.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot_final <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot +
  ggnewscale::new_scale_fill() +
  geom_rect(
    mapping = aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymin = -10,
      ymax = 0,
      fill = description
    ),
    data = regions_homologousRegion_filt,
    color = "black"
  ) +
  scale_fill_manual(
    values = descriptionColors_homologousRegion,
    guide = guide_legend(nrow = 2)
  ) +
  labs(fill = "Genes\nDescription") +
  transparentBackground +
  theme(
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30, face = "bold"),
    legend.box = "vertical",
    legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    axis.text.x = element_text(size = 30)
  )
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot_final)

Code

# Write the final perfect-chr11-copies figure to a 30 x 30 PDF.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_plot.pdf", useDingbats = FALSE, width = 30, height = 30)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot_final)
dev.off()

quartz_off_screen 
                2

Sub set

SD01, HB3, Santa-Lucia-Salvador-I

Code

# Rainbow-plot input for the homologous region, limited to the three samples
# HB3, SD01 and Santa-Lucia-Salvador-I.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>%
  filter(s_Sample %in% c("HB3", "SD01", "Santa-Lucia-Salvador-I")) %>%
  HaplotypeRainbows::prepForRainbow(minPopSize = 1)


# Sample x haplotype presence/absence table for clustering the lab isolates.
# Only sub-regions seen in more than 90% of the maximum per-region sample count
# are kept; every remaining haplotype row of a sample is marked 1 (the
# major-haplotype-only filter is disabled) and absent haplotypes are filled
# with 0.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
  group_by(p_name) %>%
  mutate(sampleCount = n_distinct(s_Sample)) %>%
  ungroup() %>%
  filter(sampleCount > 0.9 * max(sampleCount)) %>%
  mutate(marker = 1) %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Numeric matrix of the haplotype columns (everything except s_Sample), with the
# sample names as row names.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat <- as.matrix(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp %>%
    select(-s_Sample)
)
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat) <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp$s_Sample

# Pairwise sample distances and hierarchical clustering, both with the default
# settings of dist() and hclust().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist <-
  dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist_hclust <-
  hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist)

# Re-level s_Sample so the samples follow the leaf order of the hierarchical
# clustering.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
  mutate(
    s_Sample = factor(
      s_Sample,
      levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat)[
        popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist_hclust$order
      ]
    )
  )

# Rainbow plot with one unlabelled x tick per sub-region and one y tick per
# sample, labelled with the sample name. seq_along() replaces 1:length() so an
# empty level set yields no breaks instead of c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot <-
  genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name)),
    labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name))),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample),
    expand = c(0, 0)
  )




# Distinct fill colours that ggplot assigned in the first layer of the plot
# built just before this point, named by the sorted distinct popid values;
# popid "-1" is forced to grey0.
# NOTE(review): colours and popids are paired purely by position and are assumed
# to have the same length -- confirm that this is what is intended.
previousColors <- unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot)$data[[1]][["fill"]])
names(previousColors) <- sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$popid))
previousColors["-1"] <- "grey0"


# Rebuild the plot (overwriting the previous object) with popid as a factor so
# it is mapped to a discrete fill scale. Axis setup is unchanged: unlabelled x
# ticks per sub-region, y ticks labelled with sample names. seq_along() replaces
# 1:length() so an empty level set yields no breaks instead of c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot <-
  genRainbowHapPlotObj(
    popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
      mutate(popid = factor(popid)),
    colorCol = popid
  ) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name)),
    labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name))),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample),
    expand = c(0, 0)
  )

The y-axis is samples and the x-axis is sub-regions within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank. Colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just one bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

Code

# Region annotations restricted to the sub-regions present in the lab-isolate
# plot data. genomicID is re-levelled with the levels of p_name so that
# as.numeric(genomicID) gives the same integer code as the matching p_name level.
regions_homologousRegion_filt <- regions_homologousRegion %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name) %>%
  mutate(
    genomicID = factor(
      genomicID,
      levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name)
    )
  )

# Final lab-isolate figure: haplotype-rank fill scale (single-row legend), then
# a band of rectangles from y = -1 to y = 0 filled by region description, then
# the legend/axis theme.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot_final <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot +
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = haplotypeRankColors[sort(names(previousColors))],
    labels = names(haplotypeRankColors[sort(names(previousColors))]),
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) +
  guides(fill = guide_legend(nrow = 1)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    mapping = aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymin = -1,
      ymax = 0,
      fill = description
    ),
    data = regions_homologousRegion_filt,
    color = "black"
  ) +
  scale_fill_manual(
    values = descriptionColors_homologousRegion,
    guide = guide_legend(nrow = 2)
  ) +
  labs(fill = "Genes\nDescription") +
  transparentBackground +
  theme(
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30, face = "bold"),
    legend.box = "vertical",
    legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    axis.text.x = element_text(size = 30),
    axis.text.y = element_text(size = 30)
  )

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot_final)

Code

# Write the final lab-isolate figure to a 30 x 10 PDF.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot.pdf", useDingbats = FALSE, width = 30, height = 10)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot_final)
dev.off()

quartz_off_screen 
                2

Getting regions with SD01 Multi In Shared

Outputting the regions within the shared region where SD01 has multiple variants

Code

# SD01's rows from the lab-isolate plot data, with s_COI = number of distinct
# haplotypes SD01 has in each sub-region. The result stays grouped by
# (s_Sample, p_name), as before.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01 <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
  filter(s_Sample == "SD01") %>%
  group_by(s_Sample, p_name) %>%
  mutate(s_COI = length(unique(h_popUID)))

# Sub-regions where SD01 carries more than one haplotype.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01_multi <-
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01 %>%
  filter(s_COI > 1)

# Region records for those sub-regions, written with write_tsv()'s default
# header row.
# NOTE(review): the second file below is written without a header
# (col_names = FALSE) while this one keeps it -- confirm the difference is
# intentional for these .bed outputs.
regions_withSD01MultiInShared <- regions %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01_multi$p_name)
write_tsv(regions_withSD01MultiInShared, "regions_withSD01MultiInShared.bed")

# Sub-window records whose X4 column matches the name of one of those regions,
# written without a header row. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
finalHrpSubwindows_regions_withSD01MultiInShared <- finalHrpSubwindows %>%
  filter(X4 %in% regions_withSD01MultiInShared$name)
write_tsv(finalHrpSubwindows_regions_withSD01MultiInShared, "finalHrpSubwindows_regions_withSD01MultiInShared.bed", col_names = FALSE)

Plotting Shared Region for PacBio Genomes

Code

# Haplotype calls joined to the sample metadata (natural join after renaming
# meta's BiologicalSample to s_Sample), keeping only rows whose `sample` value
# starts with "Pf".
popClustering_labIso <- popClustering %>%
  left_join(rename(meta, s_Sample = BiologicalSample)) %>%
  filter(grepl("^Pf", sample))

# Restrict to the sub-regions of the homologous region.
popClustering_labIso_homologousRegion <- popClustering_labIso %>%
  filter(p_name %in% regions_homologousRegion$genomicID)

# Rainbow-plot input for those rows.
popClustering_labIso_homologousRegion_prep <- popClustering_labIso_homologousRegion %>%
  HaplotypeRainbows::prepForRainbow(minPopSize = 1)

Code

# Sample x haplotype presence/absence table for clustering these samples.
# Only sub-regions seen in more than 90% of the maximum per-region sample count
# are kept; every remaining haplotype row of a sample is marked 1 (the
# major-haplotype-only filter is disabled) and absent haplotypes are filled
# with 0.
popClustering_labIso_homologousRegion_prep_sp <- popClustering_labIso_homologousRegion_prep %>%
  group_by(p_name) %>%
  mutate(sampleCount = n_distinct(s_Sample)) %>%
  ungroup() %>%
  filter(sampleCount > 0.9 * max(sampleCount)) %>%
  mutate(marker = 1) %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Numeric matrix of the haplotype columns (everything except s_Sample), with the
# sample names as row names.
popClustering_labIso_homologousRegion_prep_sp_mat <- as.matrix(
  popClustering_labIso_homologousRegion_prep_sp %>% select(-s_Sample)
)
rownames(popClustering_labIso_homologousRegion_prep_sp_mat) <-
  popClustering_labIso_homologousRegion_prep_sp$s_Sample

# Pairwise sample distances and hierarchical clustering, both with the default
# settings of dist() and hclust().
popClustering_labIso_homologousRegion_prep_sp_dist <-
  dist(popClustering_labIso_homologousRegion_prep_sp_mat)
popClustering_labIso_homologousRegion_prep_sp_dist_hclust <-
  hclust(popClustering_labIso_homologousRegion_prep_sp_dist)

#rename the levels so they are in the order of the clustering 
popClustering_labIso_homologousRegion_prep = popClustering_labIso_homologousRegion_prep %>% 
  mutate(s_Sample = factor(s_Sample, 
                           levels = rownames(popClustering_labIso_homologousRegion_prep_sp_mat)[popClustering_labIso_homologousRegion_prep_sp_dist_hclust$order]))
popClustering_labIso_homologousRegion_prep_plot = genRainbowHapPlotObj(popClustering_labIso_homologousRegion_prep, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = 1:length(levels(popClustering_labIso_homologousRegion_prep$p_name)), 
                     labels = levels(popClustering_labIso_homologousRegion_prep$p_name), 
                     expand = c(0,0))+ 
  scale_y_continuous(expand = c(0,0))

The y-axis is samples and the x-axis is sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank: colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just one bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

Code

# Regions that are actually plotted, with genomicID re-levelled to the plot's
# p_name levels so as.numeric(genomicID) gives each region's x position.
regions_homologousRegion_filt <- mutate(
  filter(
    regions_homologousRegion,
    genomicID %in% popClustering_labIso_homologousRegion_prep$p_name
  ),
  genomicID = factor(
    genomicID,
    levels = levels(popClustering_labIso_homologousRegion_prep$p_name)
  )
)

# Add a strip of rectangles between y = -1 and y = 0, one per region, filled
# by the region description on its own fill scale.
print(
  popClustering_labIso_homologousRegion_prep_plot +
    new_scale_fill() +
    geom_rect(
      mapping = aes(
        xmin = as.numeric(genomicID) - 0.5,
        xmax = as.numeric(genomicID) + 0.5,
        ymin = -1,
        ymax = 0,
        fill = description
      ),
      data = regions_homologousRegion_filt,
      color = "black"
    ) +
    scale_fill_manual(
      values = descriptionColors_homologousRegion,
      guide = guide_legend(nrow = 2)
    )
)

Plotting whole genome inter-relatedness between strains with deletions

Code

# MIPSIBC panel: keep the rows for samples that have a previous deletion call
# and export them for the pairwise comparison.
# NOTE(review): the input is an absolute path on one user's machine.
allSel <- readr::read_tsv(
  "/Users/nick/Dropbox (Personal)/ownCloud/documents/plasmodium/falciparum/pfepipanels/Pf_Epi_Panels/data/MIPSIBC/data/pf/reports/slim_allSelectedClustersInfo.tab.txt.gz"
)

allSel_filt <- filter(allSel, s_Sample %in% previousDeletionCalls$BiologicalSample)

write_tsv(allSel_filt, "MIPSIBC_previousDeletionCalls_samples.tsv")

Code

# heome1 panel: keep the rows for samples that have a previous deletion call
# and export them for the pairwise comparison. This reuses (overwrites) the
# allSel / allSel_filt names.
# NOTE(review): the input is an absolute path on one user's machine.
allSel <- readr::read_tsv(
  "/Users/nick/Dropbox (Personal)/ownCloud/documents/plasmodium/falciparum/pfepipanels/Pf_Epi_Panels/data/heome1/data/pf/reports/slim_allSelectedClustersInfo.tab.txt.gz"
)

allSel_filt <- filter(allSel, s_Sample %in% previousDeletionCalls$BiologicalSample)

write_tsv(allSel_filt, "heome1_previousDeletionCalls_samples.tsv")

Code

# Pairwise haplotype-sharing comparison for the heome1 table exported above.
# Output goes to heome1_previousDeletionCalls_samples_pairwiseComps (--dout),
# which the Heome1 heatmap chunk reads sampleNames.tab.txt and
# jacardByHapsTarShared.tab.txt.gz from.
# NOTE(review): --metaFnp is an absolute server path and spells the file
# "metaByBiosample", while the R code reads "metaByBioSample.tab.txt" - confirm.
elucidator doPairwiseComparisonOnHapsSharing --tableFnp heome1_previousDeletionCalls_samples.tsv --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 12 --dout heome1_previousDeletionCalls_samples_pairwiseComps --verbose --overWriteDir --metaFnp /tank/data/plasmodium/falciparum/pfdata/metadata/metaByBiosample.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion --writeOutDistMatrices 

# Same comparison for the MIPSIBC table; output goes to
# MIPSIBC_previousDeletionCalls_samples_pairwiseComps.
elucidator doPairwiseComparisonOnHapsSharing --tableFnp MIPSIBC_previousDeletionCalls_samples.tsv --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 12 --dout MIPSIBC_previousDeletionCalls_samples_pairwiseComps --verbose --overWriteDir --metaFnp /tank/data/plasmodium/falciparum/pfdata/metadata/metaByBiosample.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion --writeOutDistMatrices

Heome1

Code

# Heome1: heatmap of the jacardByHapsTarShared matrix between samples, with
# per-sample annotations on the top and left.

# Per-sample metadata; only `sample` and `Pattern` are used in this chunk.
sample_metadata_withAllDeletionCalls <- readr::read_tsv("sample_metadata_withAllDeletionCalls.tsv")

# Both files are read without a header row, so columns come back as X1, X2, ...
# The directory is the --dout of the heome1 elucidator run.
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_samples <- readr::read_tsv(
  "heome1_previousDeletionCalls_samples_pairwiseComps/sampleNames.tab.txt",
  col_names = FALSE
)
heome1_previousDeletionCalls_samples_jacardByHapsTarShared <- readr::read_tsv(
  "heome1_previousDeletionCalls_samples_pairwiseComps/jacardByHapsTarShared.tab.txt.gz",
  col_names = FALSE
)

# Label both axes of the matrix with the sample names.
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat <- as.matrix(
  heome1_previousDeletionCalls_samples_jacardByHapsTarShared
)
colnames(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat) <- heome1_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1
rownames(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat) <- heome1_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1

# Restrict and reorder to the selected samples. Indexing by name stops with
# "subscript out of bounds" if a selected sample is missing from the matrix.
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat <- heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat[
  metaSelected$BiologicalSample,
  metaSelected$BiologicalSample
]

# Blue at the observed minimum, white halfway between the minimum and 1, red at 1.
col_fun <- local({
  lowest <- min(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat)
  colorRamp2(
    c(lowest, lowest + (1 - lowest) / 2, 1),
    c("#2166ac", "white", "#b2182b")
  )
})

# One metadata row per matrix column, in matrix order, plus the deletion
# Pattern and the chr11-duplication haplotype cluster (size-1 clusters are
# labelled "singlet", other names are zero-padded to width 2).
# ungroup() is applied before select(), as in the sibling heatmap chunks, so a
# grouped input cannot carry its grouping columns into the join.
previousDeletionCalls_sel <- previousDeletionCalls[
  match(
    colnames(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat),
    previousDeletionCalls$BiologicalSample
  ),
] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

# Annotation columns, renamed to the labels shown on the figure.
rowAnnoDf <- previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion",
  "newClusterName",
  "Pattern"
)] %>%
  rename(
    continent = secondaryRegion,
    Chr11DupHapCluster = newClusterName
  ) %>%
  as.data.frame()

rowAnnoColors <- createColorListFromDf(rowAnnoDf)

# NOTE(review): load() restores every object stored in the file; if it holds a
# `rowAnnoColors` object (as the file name suggests) the list built on the
# previous line is replaced - confirm this is intended.
load("rowAnnoColors.Rdata")

# Fixed palettes for the cluster and pattern annotations.
rowAnnoColors_mod <- rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] <- newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] <- rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize <- 25
annotationTitleTextSize <- 20

# Same annotations on both axes; the legend is shown once (by the row copy).
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# `title` is a legend parameter, not a graphical parameter: it now sits in
# heatmap_legend_param instead of inside gpar(), where it had no effect.
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm <- Heatmap(
  heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Code

# Render the heome1 heatmap with all legends merged underneath the plot.
draw(
  heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)

Code

# Write the heome1 heatmap to a 25 x 30 inch PDF. FALSE is spelled out because
# the shorthand `F` is an ordinary variable that can be reassigned.
pdf(
  "heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm.pdf",
  width = 25,
  height = 30,
  useDingbats = FALSE
)
draw(
  heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

MIPSIBC

Code

# MIPSIBC: heatmap of the jacardByHapsTarShared matrix between samples, with
# per-sample annotations on the top and left.

# Per-sample metadata; only `sample` and `Pattern` are used in this chunk.
sample_metadata_withAllDeletionCalls <- readr::read_tsv("sample_metadata_withAllDeletionCalls.tsv")

# Both files are read without a header row, so columns come back as X1, X2, ...
# The directory is the --dout of the MIPSIBC elucidator run.
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_samples <- readr::read_tsv(
  "MIPSIBC_previousDeletionCalls_samples_pairwiseComps/sampleNames.tab.txt",
  col_names = FALSE
)
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared <- readr::read_tsv(
  "MIPSIBC_previousDeletionCalls_samples_pairwiseComps/jacardByHapsTarShared.tab.txt.gz",
  col_names = FALSE
)

# Label both axes of the matrix with the sample names.
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat <- as.matrix(
  MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared
)
colnames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat) <- MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1
rownames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat) <- MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1

# Restrict and reorder to the selected samples. Indexing by name stops with
# "subscript out of bounds" if a selected sample is missing from the matrix.
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat <- MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat[
  metaSelected$BiologicalSample,
  metaSelected$BiologicalSample
]

# Blue at the observed minimum, white halfway between the minimum and 1, red at 1.
col_fun <- local({
  lowest <- min(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat)
  colorRamp2(
    c(lowest, lowest + (1 - lowest) / 2, 1),
    c("#2166ac", "white", "#b2182b")
  )
})

# One metadata row per matrix column, in matrix order, plus the deletion
# Pattern and the chr11-duplication haplotype cluster (size-1 clusters are
# labelled "singlet", other names are zero-padded to width 2).
previousDeletionCalls_sel <- previousDeletionCalls[
  match(
    colnames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat),
    previousDeletionCalls$BiologicalSample
  ),
] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

# Annotation columns, renamed to the labels shown on the figure.
rowAnnoDf <- previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion",
  "newClusterName",
  "Pattern"
)] %>%
  rename(
    continent = secondaryRegion,
    Chr11DupHapCluster = newClusterName
  ) %>%
  as.data.frame()

rowAnnoColors <- createColorListFromDf(rowAnnoDf)

# NOTE(review): load() restores every object stored in the file; if it holds a
# `rowAnnoColors` object (as the file name suggests) the list built on the
# previous line is replaced - confirm this is intended.
load("rowAnnoColors.Rdata")

# Fixed palettes for the cluster and pattern annotations.
rowAnnoColors_mod <- rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] <- newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] <- rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize <- 25
annotationTitleTextSize <- 20

# Same annotations on both axes; the legend is shown once (by the row copy).
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames, so no sample labels are drawn.
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs <- MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat
rownames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs) <- NULL
colnames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs) <- NULL

# `title` is a legend parameter, not a graphical parameter: it now sits in
# heatmap_legend_param instead of inside gpar(), where it had no effect.
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm <- Heatmap(
  MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Code

# Render the MIPSIBC heatmap with all legends merged underneath the plot.
draw(
  MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)

Code

# Write the MIPSIBC heatmap to a 25 x 30 inch PDF. FALSE is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
pdf(
  "MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm.pdf",
  width = 25,
  height = 30,
  useDingbats = FALSE
)
draw(
  MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

hmmIBD

Code

# Map each sample with a previous deletion call to its SRA runs: SRARuns is a
# comma-separated string that is split into one row per run.
sample_metadata_withAllDeletionCalls_sel <- sample_metadata_withAllDeletionCalls %>%
  select(sample, SRARuns) %>%
  filter(sample %in% previousDeletionCalls$BiologicalSample) %>%
  mutate(SRARuns = strsplit(SRARuns, split = ",")) %>%
  unnest(SRARuns)

hmm_fract <- readr::read_tsv("filtered_gatk_calls_database_hrpsallMetaDeletionCalls.hmm_fract.txt")

# Stack the table with a copy whose sample1/sample2 columns are swapped, so
# every pair is present in both orientations. The swap goes through temporary
# names; bind_rows() matches columns by name.
hmm_fract_combined <- bind_rows(
  hmm_fract %>%
    arrange(sample1, sample2) %>%
    select(sample1, sample2, fract_sites_IBD),
  hmm_fract %>%
    arrange(sample1, sample2) %>%
    select(sample1, sample2, fract_sites_IBD) %>%
    rename(
      temp1 = sample2,
      temp2 = sample1
    ) %>%
    rename(
      sample1 = temp1,
      sample2 = temp2
    )
)

# Translate run accessions (sample1) to biological sample names. Runs without
# a match keep their own name, and "fcr3" is normalised to "FCR3".
hmm_fract_combined_samples <- tibble(sample1 = unique(hmm_fract_combined$sample1)) %>%
  left_join(
    sample_metadata_withAllDeletionCalls_sel %>%
      rename(
        BiologicalSample = sample,
        sample1 = SRARuns
      )
  ) %>%
  mutate(BiologicalSample = ifelse(is.na(BiologicalSample), sample1, BiologicalSample)) %>%
  mutate(BiologicalSample = ifelse(BiologicalSample == "fcr3", "FCR3", BiologicalSample))

# Per-sample variant counts; the bracketed column names are renamed to plain ones.
perSampleVarCounts <- readr::read_tsv("filtered_hrpsallMetaDeletionCalls_variants_perSampleCounts.tsv") %>%
  rename(
    sample1 = `[3]sample`,
    nMissing = `[14]nMissing`
  )

# Keep one run per biological sample: the one with the smallest nMissing
# (arrange() sorts NA last). The result is ungrouped so later verbs do not
# inherit the BiologicalSample grouping.
hmm_fract_combined_samples_filt <- hmm_fract_combined_samples %>%
  left_join(
    perSampleVarCounts %>%
      select(sample1, nMissing)
  ) %>%
  group_by(BiologicalSample) %>%
  arrange(BiologicalSample, nMissing) %>%
  mutate(
    rank = row_number(),
    totalSamples = n()
  ) %>%
  filter(rank == 1) %>%
  ungroup()

# Keep pairs where both runs were retained and attach biological names to
# both sides.
hmm_fract_combined_filt <- hmm_fract_combined %>%
  filter(
    sample1 %in% hmm_fract_combined_samples_filt$sample1,
    sample2 %in% hmm_fract_combined_samples_filt$sample1
  ) %>%
  left_join(
    hmm_fract_combined_samples_filt %>%
      rename(BiologicalSample1 = BiologicalSample) %>%
      select(sample1, BiologicalSample1)
  ) %>%
  left_join(
    hmm_fract_combined_samples_filt %>%
      rename(
        sample2 = sample1,
        BiologicalSample2 = BiologicalSample
      ) %>%
      select(sample2, BiologicalSample2)
  )

# Wide sample x sample table. Any pair absent from the long table is filled
# with 1.
# NOTE(review): this fills the self-comparisons (presumably the intent) but
# would also show any other missing pair as fully IBD - confirm none exist.
hmm_fract_sp <- hmm_fract_combined_filt %>%
  select(BiologicalSample1, BiologicalSample2, fract_sites_IBD) %>%
  spread(BiologicalSample2, fract_sites_IBD, fill = 1)

# Drop the first column (BiologicalSample1) and use it as row names; `-1` is
# safe where `2:ncol()` would run backwards on a one-column table.
hmm_fract_sp_mat <- as.matrix(hmm_fract_sp[, -1])
rownames(hmm_fract_sp_mat) <- hmm_fract_sp$BiologicalSample1

Code

# hmmIBD: heatmap of the fraction of IBD sites between the selected samples.
library(circlize)

# Restrict and reorder to the selected samples. Indexing by name stops with
# "subscript out of bounds" if a selected sample is missing from the matrix.
hmm_fract_sp_mat <- hmm_fract_sp_mat[metaSelected$BiologicalSample, metaSelected$BiologicalSample]

# Fixed 0 - 0.5 - 1 scale (the data-driven variant is kept for reference).
# col_fun = colorRamp2(c(min(hmm_fract_sp_mat), min(hmm_fract_sp_mat) + (1-min(hmm_fract_sp_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
col_fun <- colorRamp2(c(0, 0.5, 1), c("#2166ac", "white", "#b2182b"))

# One metadata row per matrix column, in matrix order, plus the deletion
# Pattern and the chr11-duplication haplotype cluster (size-1 clusters are
# labelled "singlet", other names are zero-padded to width 2).
# Samples with no Pattern but an HRP2 deletion call get "8-TARE1", and FCR3 is
# set to "13++11-".
previousDeletionCalls_sel <- previousDeletionCalls[
  match(colnames(hmm_fract_sp_mat), previousDeletionCalls$BiologicalSample),
] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  ) %>%
  mutate(Pattern = ifelse(is.na(Pattern) & possiblyHRP2Deleted, "8-TARE1", Pattern)) %>%
  mutate(Pattern = ifelse("FCR3" == BiologicalSample, "13++11-", Pattern))

# Annotation columns, renamed to the labels shown on the figure.
rowAnnoDf <- previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion",
  "newClusterName",
  "Pattern",
  "hrpCall"
)] %>%
  rename(
    continent = secondaryRegion,
    Chr11DupHapCluster = newClusterName
  ) %>%
  as.data.frame()

rowAnnoColors <- createColorListFromDf(rowAnnoDf)

# NOTE(review): load() restores every object stored in the file; if it holds a
# `rowAnnoColors` object (as the file name suggests) the list built on the
# previous line is replaced - confirm this is intended.
load("rowAnnoColors.Rdata")

# Fixed palettes for the cluster and pattern annotations.
rowAnnoColors_mod <- rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] <- newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] <- rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize <- 25
annotationTitleTextSize <- 20

# Same annotations on both axes; the legend is shown once (by the row copy).
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames, so no sample labels are drawn.
hmm_fract_sp_mat_nolabs <- hmm_fract_sp_mat
rownames(hmm_fract_sp_mat_nolabs) <- NULL
colnames(hmm_fract_sp_mat_nolabs) <- NULL

# `title` is a legend parameter, not a graphical parameter: it now sits in
# heatmap_legend_param instead of inside gpar(), where it had no effect.
hmm_fract_sp_mat_hm <- Heatmap(
  hmm_fract_sp_mat_nolabs,
  col = col_fun,
  name = "fracIBDSites",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "fracIBDSites",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Code

# Render the hmmIBD heatmap with all legends merged underneath the plot.
draw(
  hmm_fract_sp_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)

Code

# Write the hmmIBD heatmap to a 25 x 30 inch PDF. FALSE is spelled out because
# the shorthand `F` is an ordinary variable that can be reassigned.
pdf("hmmIBD_fract_sp_mat_hm.pdf", width = 25, height = 30, useDingbats = FALSE)
draw(
  hmm_fract_sp_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

hmmIBD for just 13-11++ parasites

Code

# Wide sample x sample IBD table limited to pairs where both samples are in
# metaSelected_hrp3_pat1; pairs absent from the long table are filled with 1.
hmm_fract_sp_pat1 <- hmm_fract_combined_filt %>%
  filter(
    BiologicalSample1 %in% metaSelected_hrp3_pat1$BiologicalSample &
      BiologicalSample2 %in% metaSelected_hrp3_pat1$BiologicalSample
  ) %>%
  select(BiologicalSample1, BiologicalSample2, fract_sites_IBD) %>%
  spread(key = BiologicalSample2, value = fract_sites_IBD, fill = 1)

# Everything after the first column becomes the matrix; the first column
# (BiologicalSample1) supplies the row names.
hmm_fract_sp_pat1_mat <- as.matrix(hmm_fract_sp_pat1[, 2:ncol(hmm_fract_sp_pat1)])
rownames(hmm_fract_sp_pat1_mat) <- hmm_fract_sp_pat1$BiologicalSample1

Code

# hmmIBD heatmap restricted to the samples in hmm_fract_sp_pat1_mat.
library(circlize)

# Fixed 0 - 0.5 - 1 scale (the data-driven variant is kept for reference).
# col_fun = colorRamp2(c(min(hmm_fract_sp_pat1_mat), min(hmm_fract_sp_pat1_mat) + (1-min(hmm_fract_sp_pat1_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
col_fun <- colorRamp2(c(0, 0.5, 1), c("#2166ac", "white", "#b2182b"))

# One metadata row per matrix column, in column order, plus the deletion
# Pattern and the chr11-duplication haplotype cluster (size-1 clusters are
# labelled "singlet", other names are zero-padded to width 2).
# NOTE(review): the same table annotates rows and columns, which assumes the
# matrix rows are in the same order as its columns - confirm.
previousDeletionCalls_sel <- previousDeletionCalls[
  match(colnames(hmm_fract_sp_pat1_mat), previousDeletionCalls$BiologicalSample),
] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

# Annotation columns (Pattern is deliberately left out for this subset).
rowAnnoDf <- previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion",
  "newClusterName",
  # "Pattern",
  "hrpCall"
)] %>%
  rename(
    continent = secondaryRegion,
    Chr11DupHapCluster = newClusterName
  ) %>%
  as.data.frame()

rowAnnoColors <- createColorListFromDf(rowAnnoDf)

# NOTE(review): load() restores every object stored in the file; if it holds a
# `rowAnnoColors` object (as the file name suggests) the list built on the
# previous line is replaced - confirm this is intended.
load("rowAnnoColors.Rdata")

# Fixed palettes; the Pattern entry has no matching column in rowAnnoDf here.
rowAnnoColors_mod <- rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] <- newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] <- rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize <- 25
annotationTitleTextSize <- 20

# Same annotations on both axes; the legend is shown once (by the row copy).
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames, so no sample labels are drawn.
hmm_fract_sp_pat1_mat_nolabs <- hmm_fract_sp_pat1_mat
rownames(hmm_fract_sp_pat1_mat_nolabs) <- NULL
colnames(hmm_fract_sp_pat1_mat_nolabs) <- NULL

# `title` is a legend parameter, not a graphical parameter: it now sits in
# heatmap_legend_param instead of inside gpar(), where it had no effect.
hmm_fract_sp_pat1_mat_hm <- Heatmap(
  hmm_fract_sp_pat1_mat_nolabs,
  col = col_fun,
  name = "fracIBDSites",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "fracIBDSites",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Code

# Render the pattern-1 hmmIBD heatmap with all legends merged underneath.
draw(
  hmm_fract_sp_pat1_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)

Code

# Write the pattern-1 hmmIBD heatmap to a 25 x 25 inch PDF. FALSE is spelled
# out because the shorthand `F` is an ordinary variable that can be reassigned.
pdf("hmmIBD_fract_sp_mat_hm_pat1.pdf", width = 25, height = 25, useDingbats = FALSE)
draw(
  hmm_fract_sp_pat1_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

hmmIBD for just 13-5++ parasites

Code

# Samples whose Pattern is exactly "13-5++".
sample_metadata_withAllDeletionCalls_13_5_pattern <- filter(
  sample_metadata_withAllDeletionCalls,
  Pattern == "13-5++"
)

# Wide sample x sample IBD table limited to pairs where both samples carry
# that pattern; pairs absent from the long table are filled with 1.
hmm_fract_sp_13_5_pattern <- hmm_fract_combined_filt %>%
  filter(
    BiologicalSample1 %in% sample_metadata_withAllDeletionCalls_13_5_pattern$sample &
      BiologicalSample2 %in% sample_metadata_withAllDeletionCalls_13_5_pattern$sample
  ) %>%
  select(BiologicalSample1, BiologicalSample2, fract_sites_IBD) %>%
  spread(key = BiologicalSample2, value = fract_sites_IBD, fill = 1)

# Everything after the first column becomes the matrix; the first column
# (BiologicalSample1) supplies the row names.
hmm_fract_sp_13_5_pattern_mat <- as.matrix(
  hmm_fract_sp_13_5_pattern[, 2:ncol(hmm_fract_sp_13_5_pattern)]
)
rownames(hmm_fract_sp_13_5_pattern_mat) <- hmm_fract_sp_13_5_pattern$BiologicalSample1

Code

# hmmIBD heatmap restricted to the samples in hmm_fract_sp_13_5_pattern_mat.
library(circlize)

# Fixed 0 - 0.5 - 1 scale (the data-driven variant is kept for reference).
# col_fun = colorRamp2(c(min(hmm_fract_sp_13_5_pattern_mat), min(hmm_fract_sp_13_5_pattern_mat) + (1-min(hmm_fract_sp_13_5_pattern_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
col_fun <- colorRamp2(c(0, 0.5, 1), c("#2166ac", "white", "#b2182b"))

# One metadata row per matrix column, in column order. Pattern and cluster
# name are still joined on, although only the geography columns are annotated
# below.
# NOTE(review): the same table annotates rows and columns, which assumes the
# matrix rows are in the same order as its columns - confirm.
previousDeletionCalls_sel <- previousDeletionCalls[
  match(colnames(hmm_fract_sp_13_5_pattern_mat), previousDeletionCalls$BiologicalSample),
] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

# Geography-only annotation for this subset.
rowAnnoDf <- previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion"
  # "newClusterName",
  # "Pattern",
  # "hrpCall"
)] %>%
  rename(continent = secondaryRegion) %>%
  as.data.frame()

rowAnnoColors <- createColorListFromDf(rowAnnoDf)

# NOTE(review): load() restores every object stored in the file; if it holds a
# `rowAnnoColors` object (as the file name suggests) the list built on the
# previous line is replaced - confirm this is intended.
load("rowAnnoColors.Rdata")

# The Pattern entry has no matching column in rowAnnoDf here.
rowAnnoColors_mod <- rowAnnoColors
# rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] <- rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize <- 25
annotationTitleTextSize <- 20

# Same annotations on both axes; the legend is shown once (by the row copy).
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames, so no sample labels are drawn.
hmm_fract_sp_13_5_pattern_mat_nolabs <- hmm_fract_sp_13_5_pattern_mat
rownames(hmm_fract_sp_13_5_pattern_mat_nolabs) <- NULL
colnames(hmm_fract_sp_13_5_pattern_mat_nolabs) <- NULL

# `title` is a legend parameter, not a graphical parameter: it now sits in
# heatmap_legend_param instead of inside gpar(), where it had no effect.
hmm_fract_sp_13_5_pattern_mat_hm <- Heatmap(
  hmm_fract_sp_13_5_pattern_mat_nolabs,
  col = col_fun,
  name = "fracIBDSites",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "fracIBDSites",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(
      fontsize = annotationTextSize,
      fontface = "bold"
    )
  )
)

Code

# Render the 13-5++ hmmIBD heatmap with all legends merged underneath.
draw(
  hmm_fract_sp_13_5_pattern_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)

Code

# Write the 13-5++ hmmIBD heatmap to a 15 x 15 inch PDF. FALSE is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
pdf("hmmIBD_fract_sp_mat_hm_13_5_pattern.pdf", width = 15, height = 15, useDingbats = FALSE)
draw(
  hmm_fract_sp_13_5_pattern_mat_hm,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

quartz_off_screen 
                2

moire

Code

# Run moire on the prepared long-form table and save both the MCMC results and
# the loaded data.

# Install moire only when it is not already available, instead of
# reinstalling from GitHub on every run.
if (!requireNamespace("moire", quietly = TRUE)) {
  remotes::install_github("EPPIcenter/moire")
}

# Build explicit paths instead of calling setwd(), so the working directory of
# the rest of the document is left untouched; files land in the same directory.
moireDir <- "/tank/data/plasmodium/falciparum/pfdata/moire_on_hrp3Samples/"

# `read.tsv` is not a base R or readr function; readr::read_tsv is the reader
# used throughout this document. load_long_form_data is namespaced like
# run_mcmc below, since moire is never attached with library().
df <- readr::read_tsv(file.path(moireDir, "allSel_withDeletions_prep_outForMoire.tsv"))
data <- moire::load_long_form_data(df)

# With data in appropriate format, run MCMC as follows
mcmc_results <- moire::run_mcmc(data, is_missing = data$is_missing)
write_rds(mcmc_results, file.path(moireDir, "mcmc_results.rds"))
write_rds(data, file.path(moireDir, "data_for_moire.rds"))

Code

# Reload the saved moire inputs and MCMC results.
data_for_moire <- read_rds("moire_on_hrp3Samples/data_for_moire.rds")
mcmc_results <- read_rds("moire_on_hrp3Samples/mcmc_results.rds")

# Per-sample COI summaries from the first chain only: each element of `coi`
# is reduced with median / mean / max and then rounded.
coiEsts <- tibble(
  sampleID = data_for_moire$sample_ids,
  medianCOI = round(unlist(lapply(X = mcmc_results$chains[[1]]$coi, FUN = median))),
  meanCOI = round(unlist(lapply(X = mcmc_results$chains[[1]]$coi, FUN = mean))),
  maxCOI = round(unlist(lapply(X = mcmc_results$chains[[1]]$coi, FUN = max)))
)

---
title: "Plotting haplotype variation within regions"
---

```{r setup, echo=FALSE, message=FALSE}
source("../common.R")
```

## Reading in data

```{r}
load("rowAnnoColors.Rdata")
```

## Downloads

```{r}
#| results: asis
#| echo: false

# One download link per input file. The metaByBioSample link now carries the
# .tab.txt extension of the file that is actually read below.
# NOTE(review): assumes createDownloadLink() links to the path it is given -
# confirm against ../common.R.
cat(createDownloadLink("../meta/metadata/meta.tab.txt"))
cat(createDownloadLink("../meta/metadata/metaByBioSample.tab.txt"))
cat(createDownloadLink("wgs_variants/THEREALMcCOIL/categorical_method/real_mccoil_COI_calls.tsv"))
cat(createDownloadLink("finalHRPII_HRPIII_windows_withTunedSubWindows/popClustering/reports/slim_allSelectedClustersInfo.tab.txt.gz"))
cat(createDownloadLink("metaSelected.tab.txt"))
cat(createDownloadLink("allMeta_HRP2_HRP3_deletionCalls.tab.txt"))
cat(createDownloadLink("subwindows_regionMeta.tab.txt"))
```

```{r}
# Sample metadata; "South East Asia - East" is relabelled as "Cambodia".
meta = readr::read_tsv("../meta/metadata/meta.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))
metaByBioSample = readr::read_tsv("../meta/metadata/metaByBioSample.tab.txt") %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))

# coiCalls = readr::read_tsv("/Users/nick/Dropbox (Personal)/ownCloud/documents/plasmodium/falciparum/pfepipanels/Pf_Epi_Panels/data/MAD4HATTER/data/pf/COI_calls.tab.txt")
# coiCalls_poly = coiCalls %>% 
#   filter(COI > 1)

coiCalls = readr::read_tsv("heome1_COI_calls.tab.txt")
#coiCalls = readr::read_tsv("PfSMART_COI_calls.tab.txt")

# Samples called with COI above 1.
coiCalls_poly = coiCalls %>%
  filter(COI > 1)

realmccoilCoiCalls = readr::read_tsv("wgs_variants/THEREALMcCOIL/categorical_method/real_mccoil_COI_calls.tsv")

# Samples where either median differs from 1.
realmccoilCoiCalls_poly = realmccoilCoiCalls %>% 
  filter(random_median != 1 | topHE_median != 1)

previousDeletionCalls = readr::read_tsv("allMeta_HRP2_HRP3_deletionCalls.tab.txt") %>% 
  #filter(country %!in% c("Bangladesh", "Mauritania", "Myanmar", "The Gambia")) %>% 
  #filter(((grepl("SPT", sample) & possiblyChr11Deleted))) %>% 
  #filter(BiologicalSample %!in% coiCalls_poly$sample) %>% 
  mutate(country = gsub("South East Asia - East", "Cambodia", country))
# Attach the deletion calls and derive a combined hrp2/hrp3 call label, then
# add the topHE_median COI estimate as COI. Rows that match none of the first
# three conditions (including NA flags) fall through to "pfhrp2+/pfhrp3+".
meta <- meta %>%
  left_join(previousDeletionCalls) %>%
  mutate(hrpCall = case_when(
    possiblyHRP2Deleted & possiblyHRP3Deleted ~ "pfhrp2-/pfhrp3-",
    possiblyHRP2Deleted & !possiblyHRP3Deleted ~ "pfhrp2-/pfhrp3+",
    !possiblyHRP2Deleted & possiblyHRP3Deleted ~ "pfhrp2+/pfhrp3-",
    TRUE ~ "pfhrp2+/pfhrp3+"
  )) %>%
  left_join(realmccoilCoiCalls %>%
              select(BiologicalSample, topHE_median) %>%
              rename(COI = topHE_median))
  # left_join(coiCalls %>%
  #             rename(BiologicalSample = sample))

homologousRegion <- readr::read_tsv("../rRNA_segmental_duplications/sharedBetween11_and_13/investigatingChrom11Chrom13/Pf3D7_13_v3-2792021-2807295-for--Pf3D7_11_v3-1918028-1933288-for.bed", col_names = FALSE)

regions <- readr::read_tsv("subwindows_regionMeta.tab.txt")

# Selected samples, keeping only those whose COI estimate equals 1.
metaSelected <- readr::read_tsv("metaSelected.tab.txt") %>%
  #select(-COI) %>%
  left_join(realmccoilCoiCalls %>%
              select(BiologicalSample, topHE_median) %>%
              rename(COI = topHE_median)) %>%
  filter(COI == 1)

metaSelected_hrp2_deleted <- metaSelected %>%
  filter(possiblyHRP2Deleted)

metaSelected_hrp3_deleted <- metaSelected %>%
  filter(possiblyHRP3Deleted)

metaSelected_hrp2_and_hrp3_deleted <- metaSelected %>%
  filter(possiblyHRP2Deleted, possiblyHRP3Deleted)

# Lookup from region name to genomicID.
regions_key <- regions %>%
  select(name, genomicID)
```

```{r}
finalHrpSubwindows <- readr::read_tsv("../windowAnalysis/windows/finalHRPII_HRPIII_windows_withTuned_combinedVarConservedRegions.bed", col_names = FALSE)

# Regions excluded from the filtered clustering table below.
erroneousRegions <- c("Pf3D7_11_v3-1944071-1944237", "Pf3D7_11_v3-1944083-1944229", "Pf3D7_11_v3-1938175-1938354")

samplesCovered <- readr::read_tsv("samplesCovered.txt", col_names = "sample") %>%
  left_join(meta %>%
              select(sample, BiologicalSample))
```

```{r}
popClustering <- readr::read_tsv("finalHRPII_HRPIII_windows_withTunedSubWindows/popClustering/reports/slim_allSelectedClustersInfo.tab.txt.gz")
#
# NOTE(review): confirm this statement is intended to be active; it adds a
# duplicationRegion flag that the join below carries into popClustering.
regions_key <- regions_key %>%
  mutate(duplicationRegion = grepl("for", name))
# renaming and duplicate the dup region
popClustering <- popClustering %>%
  left_join(regions_key %>%
              rename(p_name = name)) %>%
  mutate(p_name = genomicID) %>%
  mutate(h_popUID = paste0(genomicID, "--", h_popUID))
#
popClustering_filt <- popClustering %>%
  filter(s_Sample %fin% metaSelected$BiologicalSample) %>%
  filter(genomicID %!in% erroneousRegions)

previousDeletionCalls_hrp3Delete <- previousDeletionCalls %>%
  filter(possiblyHRP3Deleted)

popClustering_filt_hrp3Delete <- popClustering_filt %>%
  filter(s_Sample %in% previousDeletionCalls_hrp3Delete$BiologicalSample)

regions_afterHomologous <- regions %>%
  filter(afterHomologousRegion)
```

```{r}
allDeletionTypeMeta <- readr::read_tsv("allMetaDeletionCalls.tab.txt") %>%
  filter(BiologicalSample %in% metaSelected$BiologicalSample)

allDeletionTypeMeta_hrp3_pat1 <- allDeletionTypeMeta %>%
  filter(HRP3_deletionPattern == "Pattern 1")

popClustering_filt_hrp3_pat1 <- popClustering_filt %>%
  filter(s_Sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample)

allDeletionTypeMeta_hrp3_pat2 <- allDeletionTypeMeta %>%
  filter(HRP3_deletionPattern == "Pattern 2")

# Filter on the Pattern 2 samples; this previously reused the Pattern 1
# sample list, which made the pat2 table a copy of the pat1 table.
popClustering_filt_hrp3_pat2 <- popClustering_filt %>%
  filter(s_Sample %in% allDeletionTypeMeta_hrp3_pat2$BiologicalSample)

allDeletionTypeMeta_deletionPatternCounts <- allDeletionTypeMeta %>%
  filter(!is.na(HRP3_deletionPattern)) %>%
  group_by(HRP3_deletionPattern) %>%
  count()

create_dt(allDeletionTypeMeta_deletionPatternCounts)
```

## Pattern 2

```{r}
# Pattern 2 sample counts at three geographic resolutions.
allDeletionTypeMeta_hrp3_pat2_count_country <- allDeletionTypeMeta_hrp3_pat2 %>%
  group_by(country, region, secondaryRegion) %>%
  count()
create_dt(allDeletionTypeMeta_hrp3_pat2_count_country)

allDeletionTypeMeta_hrp3_pat2_count_region <- allDeletionTypeMeta_hrp3_pat2 %>%
  group_by(region, secondaryRegion) %>%
  count()
create_dt(allDeletionTypeMeta_hrp3_pat2_count_region)

allDeletionTypeMeta_hrp3_pat2_count_continent <- allDeletionTypeMeta_hrp3_pat2 %>%
  group_by(secondaryRegion) %>%
  count()
create_dt(allDeletionTypeMeta_hrp3_pat2_count_continent)
```

## 13-11++

### Chr 11 duplicated region

#### Getting chr 11 duplication conserved counts

Below is code determining the
samples with possible chr11 fragment duplication and breaking down the counts of perfect duplicated copies vs divergent copies.

```{r}
# Chromosome 11 sub-windows flagged afterHomologousRegion, each labelled with
# a description parsed out of extraField0 ("intergenic" when it holds NA).
regions_afterHomologous_chr11 <- regions %>%
  filter(`#chrom` == "Pf3D7_11_v3", afterHomologousRegion) %>%
  mutate(
    description = case_when(
      grepl("extraField0=NA", extraField0) ~ "intergenic",
      TRUE ~ gsub("\\]", "", gsub(".*description=", "", extraField0))
    )
  )

# One palette colour per distinct description; the "intergenic" entry is then
# overwritten with a colour whose alpha channel is 00.
descriptionColors <- scheme$hex(length(unique(regions_afterHomologous_chr11$description)))
names(descriptionColors) <- unique(regions_afterHomologous_chr11$description)
descriptionColors["intergenic"] <- "#FF000000"
```

```{r}
# Pattern 1 haplotype rows limited to the chr11 sub-windows above;
# uniqHaps is the number of rows per sample and target.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 <- popClustering_filt_hrp3_pat1 %>%
  filter(p_name %in% regions_afterHomologous_chr11$genomicID) %>%
  group_by(s_Sample, p_name) %>%
  mutate(uniqHaps = n())

# Per sample: row counts for each uniqHaps value, relative to the number of
# distinct targets the sample has.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_uniqSum <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
  group_by(s_Sample) %>%
  mutate(targets = length(unique(genomicID))) %>%
  group_by(s_Sample, targets, uniqHaps) %>%
  count() %>%
  mutate(freq = n / targets)

# Haplotypes at or below this averaged fraction are dropped before counting.
minafCutoff <- 0.15

# Per sample: how many targets keep exactly one haplotype after the cutoff
# (conserved), and that count as a fraction of the sample's targets.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
  filter(c_AveragedFrac > minafCutoff) %>%
  group_by(s_Sample, p_name) %>%
  mutate(uniqHaps = n()) %>%
  mutate(marker = uniqHaps == 1) %>%
  group_by(s_Sample) %>%
  summarise(
    conserved = sum(marker),
    targets = length(unique(genomicID))
  ) %>%
  mutate(conservedID = conserved / targets)

# Samples above this conserved fraction are treated as close-to-perfect copies.
conservedCutOff <- 0.99

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  filter(conservedID > conservedCutOff)
# Overall count and frequency of samples whose conserved fraction exceeds
# conservedCutOff.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOff <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  ungroup() %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = length(unique(s_Sample))
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

# Same summary, split by country and region.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByCountry <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(metaByBioSample %>%
              rename(s_Sample = sample)) %>%
  group_by(country, region) %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = length(unique(s_Sample))
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

# Same summary, split by region.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByRegion <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(metaByBioSample %>%
              rename(s_Sample = sample)) %>%
  group_by(region) %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = length(unique(s_Sample))
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)

# Same summary, split by secondaryRegion.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByContinent <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(metaByBioSample %>%
              rename(s_Sample = sample)) %>%
  group_by(secondaryRegion) %>%
  summarise(
    perfectDuplication = sum(marker),
    totalSamps = length(unique(s_Sample))
  ) %>%
  mutate(perfectCopyFreq = perfectDuplication / totalSamps)
```

The number of samples with perfect copies

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOff)
```

The number of samples with perfect copies broken down by country

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByCountry)
```

The number of samples with perfect copies broken down by regions

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByRegion)
```

The number of samples with perfect copies broken down by continent.

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_cutOffByContinent)

# Mean, minimum and standard deviation of conservedID over the samples at or
# below conservedCutOff.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_meanId <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum %>%
  filter(conservedID <= conservedCutOff) %>%
  summarise(
    meanID = mean(conservedID),
    minID = min(conservedID),
    sdID = sd(conservedID)
  )
```

The breakdown of level of divergence in the samples with divergent samples.

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_meanId)
```

## Population analysis of chr11 duplicated region

Calculating the population of the haplotypes after the shared region on chr 11, the duplicated region to see if there is any population signal associated with the duplicated copy. E.g. if the copy is unique to a subset of haplotypes, if the copy is always perfect or if there is variation.
```{r}
# All samples (not only the selected ones): haplotype rows for the chr11
# sub-windows, with the erroneous regions removed.
popClustering_filt_regions_afterHomologous_chr11 <- popClustering %>%
  filter(genomicID %!in% erroneousRegions) %>%
  filter(p_name %in% regions_afterHomologous_chr11$genomicID)

# Number of distinct targets per sample.
popClustering_filt_regions_afterHomologous_chr11_tarCounts <-
  popClustering_filt_regions_afterHomologous_chr11 %>%
  group_by(s_Sample) %>%
  summarise(tarCounts = length(unique(p_name)))

# Keep samples with at least 80% of the maximum target count, or any sample
# present in the previous deletion calls.
popClustering_filt_regions_afterHomologous_chr11_tarCounts_filt <-
  popClustering_filt_regions_afterHomologous_chr11_tarCounts %>%
  filter(tarCounts >= 0.80 * max(tarCounts) | s_Sample %in% previousDeletionCalls$BiologicalSample)

# Number of distinct samples per target.
popClustering_filt_regions_afterHomologous_chr11_sampCounts <-
  popClustering_filt_regions_afterHomologous_chr11 %>%
  group_by(p_name) %>%
  summarise(sampCounts = length(unique(s_Sample)))

# Metadata plus deletion-pattern calls, written out for the elucidator run.
metaByBioSample_out <- metaByBioSample %>%
  left_join(allDeletionTypeMeta %>%
              select(-sample, -ExperimentSample) %>%
              rename(sample = BiologicalSample))
write_tsv(metaByBioSample_out, "metaByBioSample_outwithHrpCalls.tab.txt")

write_tsv(
  popClustering_filt_regions_afterHomologous_chr11 %>%
    filter(s_Sample %in% popClustering_filt_regions_afterHomologous_chr11_tarCounts_filt$s_Sample) %>%
    ungroup() %>%
    select(s_Sample, p_name, h_popUID, c_AveragedFrac),
  "popClustering_filt_regions_afterHomologous_chr11.tab.txt.gz"
)
```

```{bash, eval = F}
elucidator doPairwiseComparisonOnHapsSharingDev --tableFnp popClustering_filt_regions_afterHomologous_chr11.tab.txt.gz --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 14 --dout pairwiseComps_regions_afterHomologous_chr11 --verbose --overWriteDir --metaFnp metaByBioSample_outwithHrpCalls.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion,HRP3_deletionPattern --writeOutDistMatrices
```

```{r}
#jacardDist = readr::read_tsv("pairwiseComps_regions_afterHomologous_chr11/percOfTarSharingAtLeastOneHap.tab.txt.gz", col_names = F)
# Pairwise matrix written by the elucidator run; columns are named from the
# sample-name file and a "sample" column is appended.
jacardDist <- readr::read_tsv("pairwiseComps_regions_afterHomologous_chr11/jacardByHapsTarShared.tab.txt.gz", col_names = FALSE)
jacardDistSamps <- readr::read_tsv("pairwiseComps_regions_afterHomologous_chr11/sampleNames.tab.txt", col_names = "samples")

colnames(jacardDist) <- jacardDistSamps$samples
jacardDist$sample <- jacardDistSamps$samples

# jacardDist_filt = jacardDist[jacardDist$sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample,c(allDeletionTypeMeta_hrp3_pat1$BiologicalSample, "sample")]
# jacardDist_gat = jacardDist_filt %>%
#   gather(otherSample, index,1:(ncol(.) - 1))

# Long form: one row per (sample, otherSample) pair.
jacardDist_gat <- jacardDist %>%
  gather(otherSample, index, 1:(ncol(.) - 1))

# Pairs where both samples are Pattern 1.
jacardDist_gat_filt <- jacardDist_gat %>%
  filter(
    sample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample,
    otherSample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample
  )

# Back to a square matrix with sample names as row names.
jacardDist_gat_filt_sp <- jacardDist_gat_filt %>%
  spread(otherSample, index)

jacardDist_gat_filt_sp_mat <- as.matrix(jacardDist_gat_filt_sp[, 2:ncol(jacardDist_gat_filt_sp)])
rownames(jacardDist_gat_filt_sp_mat) <- jacardDist_gat_filt_sp$sample
```

#### Getting cluster groups

```{r}
jacardDist_gat_filt_sp_mat_pat1 <- jacardDist_gat_filt_sp_mat

# get data just for variable regions (e.g., minPopSize = 2)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering <-
  HaplotypeRainbows::prepForRainbow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11, minPopSize = 2)

# cluster based on the sharing between
# Builds a sample x haplotype presence table (1 = present, 0 = absent).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99 * max(sampleCount)) %>% # get just targets with high sample coverage otherwise clustering will be by missingness
  group_by(s_Sample, p_name) %>%
  #filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat <- as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp[, 2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp)]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat) <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp$s_Sample

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist <-
  dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust <-
  hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist)

# get clustering based on the jacard distance too for reference
jacardDist_gat_filt_sp_mat_pat1_hc <- hclust(dist(jacardDist_gat_filt_sp_mat_pat1))

k_groups <- 42
h_groups <- 1.1

# First pass: cut into k_groups clusters and plot the coloured dendrogram.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups <-
  cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust, k = k_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <-
  as.dendrogram(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <-
  color_labels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend, k = k_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend)

# Second pass: cut at height h_groups instead. This overwrites the groups and
# dendrogram from the first pass, so the height-based cut is what is used below.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups <-
  cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust, h = h_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <-
  as.dendrogram(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend <-
  color_labels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend, h = h_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend)

pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend.pdf", height = 10, width = 20, useDingbats = FALSE)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_dend)
dev.off()

# Reference clustering on the Jaccard matrix, cut into k_groups clusters.
jacardDist_gat_filt_sp_mat_pat1_hc_groups <- cutree(jacardDist_gat_filt_sp_mat_pat1_hc, k = k_groups)
jacardDist_gat_filt_sp_mat_pat1_hc_dend <- as.dendrogram(jacardDist_gat_filt_sp_mat_pat1_hc)
jacardDist_gat_filt_sp_mat_pat1_hc_dend <- color_labels(jacardDist_gat_filt_sp_mat_pat1_hc_dend, k = k_groups)
plot(jacardDist_gat_filt_sp_mat_pat1_hc_dend)

pdf("jacardDist_gat_filt_sp_mat_pat1_hc_dend.pdf", height = 10, width = 20, useDingbats = FALSE)
plot(jacardDist_gat_filt_sp_mat_pat1_hc_dend)
dev.off()

# One row per sample with its cluster id and the size of that cluster.
# The table stays grouped by hcclust.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df <- tibble(
  BiologicalSample = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups),
  hcclust = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups
) %>%
  # mutate(BiologicalSample =factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering$BiologicalSample))) %>%
  group_by(hcclust) %>%
  mutate(hcclustSize = n())
# Cluster ids split into multi-sample clusters and singletons.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  filter(hcclustSize != 1) %>%
  select(hcclust) %>%
  unique()

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_singletonGroups <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  filter(hcclustSize == 1) %>%
  select(hcclust) %>%
  unique()

newscheme <- iwanthue(seed = 626, force_init = TRUE)
# NOTE(review): this call's result is discarded; it is kept because it may
# advance the seeded generator that the palette below depends on - confirm.
newscheme$hex(8)

# nonSingletonGroupsColors = createColorListFromDf(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups)$hcclust
# Generated colours for multi-sample clusters, a fixed grey for singletons.
nonSingletonGroupsColors <- newscheme$hex(nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups))
names(nonSingletonGroupsColors) <- popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_biggerGroups$hcclust

nonSingletonGroupsColors_singleton <- rep("grey71", nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_singletonGroups))
names(nonSingletonGroupsColors_singleton) <- popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_singletonGroups$hcclust

haploGroupColors <- c(nonSingletonGroupsColors, nonSingletonGroupsColors_singleton)

# One row per cluster, largest first. newClusterName is the size rank;
# Chr11DupHapCluster is that rank zero-padded to two characters, or "singlet"
# for clusters of size 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  select(hcclust, hcclustSize) %>%
  ungroup() %>%
  unique() %>%
  arrange(desc(hcclustSize)) %>%
  mutate(
    hcclust = as.character(hcclust),
    newClusterName = row_number()
  ) %>%
  left_join(tibble(
    hcclust = names(haploGroupColors),
    colors = unname(haploGroupColors)
  )) %>%
  mutate(Chr11DupHapCluster = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, pad = "0", width = 2)))

newHaploGroupColors <- popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$colors
names(newHaploGroupColors) <- popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$newClusterName

# Colours keyed by Chr11DupHapCluster label: one per multi-sample cluster
# plus a single shared "singlet" entry.
newHaploGroupWithSingletColors <- c(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$colors[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$hcclustSize > 1],
  "grey77"
)
names(newHaploGroupWithSingletColors) <- c(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$Chr11DupHapCluster[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts$hcclustSize > 1],
  "singlet"
)

# Attach the cluster labels/colours to the per-sample table and save it.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_groupCounts %>%
              mutate(hcclust = as.integer(hcclust)))

write_tsv(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df, "popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df.tsv")
```

```{r}
library(circlize)
#col_fun = colorRamp2(c(0, 0.5, 1), c(heat.colors(3)))
# Blue-white-red ramp from the matrix minimum to 1, white at the midpoint.
col_fun <- colorRamp2(
  c(
    min(jacardDist_gat_filt_sp_mat),
    min(jacardDist_gat_filt_sp_mat) + (1 - min(jacardDist_gat_filt_sp_mat)) / 2,
    1
  ),
  c("#2166ac", "white", "#b2182b")
)

jacardDist_gat_filt_sp_mat_noLabs <- jacardDist_gat_filt_sp_mat
jacardDist_gat_filt_sp_mat_pat1 <- jacardDist_gat_filt_sp_mat

meta_preferredSample <- metaSelected %>%
  filter(PreferredSample)

# Metadata rows in the same order as the matrix rows, plus cluster label and
# the perfect-copy flag.
metaSelected_hrp3_pat1 <- meta_preferredSample[match(rownames(jacardDist_gat_filt_sp_mat), meta_preferredSample$BiologicalSample), ] %>%
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
              ungroup() %>%
              select(BiologicalSample, Chr11DupHapCluster))

metaSelected_hrp3_pat1 <- metaSelected_hrp3_pat1 %>%
  mutate(PerfectChr11Copy = BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample)

rownames(jacardDist_gat_filt_sp_mat_noLabs) <- NULL
colnames(jacardDist_gat_filt_sp_mat_noLabs) <- NULL

# Only rows/columns whose site is "LabIsolate" keep a visible label.
RowLabs <- metaSelected_hrp3_pat1$BiologicalSample
RowLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] <- ""
ColLabs <- metaSelected_hrp3_pat1$BiologicalSample
ColLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] <- ""
#RowLabs[metaSelected$country != "Ethiopia"] = ""
rownames(jacardDist_gat_filt_sp_mat_noLabs) <- RowLabs
colnames(jacardDist_gat_filt_sp_mat_noLabs) <- ColLabs

rowAnnoDf <- metaSelected_hrp3_pat1[, c("hrpCall", "PerfectChr11Copy", "country", "region", "secondaryRegion", "Chr11DupHapCluster")] %>%
  rename(continent = secondaryRegion) %>%
  as.data.frame()

annotationTextSize <- 25
annotationTitleTextSize <- 20

rowAnnoColors[["Chr11DupHapCluster"]] <- newHaploGroupWithSingletColors

topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  show_legend = FALSE,
  gp = gpar(col = "grey10")
)

sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  col = rowAnnoColors,
  gp = gpar(col = "grey10")
)

haptype_hrp3_pat1HeatMap <- Heatmap(
  jacardDist_gat_filt_sp_mat_noLabs,
  cluster_columns = TRUE,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  heatmap_legend_param = list(
    # The legend title is a legend parameter, not a graphical parameter, so
    # it is set here rather than inside gpar().
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

### Samples with a duplicated chromosome 11 and deleted chr 13 (13-11++ of HRP3 deletion)

Jaccard index of the duplicated region on chromosome 11: a Jaccard index of 1 means complete agreement between samples on this region, while 0 means no haplotypes are shared in this region. Additional metadata of the samples is shown on top and to the left, including country/region, the hrp2/3 calls, and whether the Chr11 segment that has been duplicated is a perfect copy or not. It appears the African samples and South American samples, while related within continent, are not very closely related to each other.

```{r}
#| fig-column: screen
#| fig-width: 25
#| fig-height: 15
draw(haptype_hrp3_pat1HeatMap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
pdf("haptype_hrp3_pat1.pdf", useDingbats = FALSE, width = 25, height = 20)
draw(haptype_hrp3_pat1HeatMap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

#### Similar samples to 13-11++

Below will get the samples that have a chromosome 11 that is similar to the 13-11++ samples (Deleted hrp3, duplicated sub-telomeric chr11 segment). These new similar samples will be regardless of hrp2/3 deletion status.
This will show if the chr11 that has been duplicated is circulating in the general population or is only associated with the samples with HRP3 deletion and chr11 duplication.

```{r}
# Samples flagged IsFieldSample or whose site is "LabIsolate".
metaByBioSample_fieldOrIsolate <- metaByBioSample %>%
  filter(IsFieldSample | "LabIsolate" == site)

# Pairs involving at least one Pattern 1 sample, both members being
# field/isolate samples present in samplesCovered, with an index above 0.98.
jacardDist_gat_filt_forOtherSimilarToPat1 <- jacardDist_gat %>%
  filter(
    sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample |
      otherSample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample
  ) %>%
  filter(
    sample %in% metaByBioSample_fieldOrIsolate$sample,
    otherSample %in% metaByBioSample_fieldOrIsolate$sample
  ) %>%
  filter(
    sample %in% samplesCovered$BiologicalSample,
    otherSample %in% samplesCovered$BiologicalSample
  ) %>%
  # filter(index > 0.99)
  filter(index > 0.98)

# Every sample appearing in those pairs plus all Pattern 1 samples,
# with "FCR3" removed.
simSamples <- c(unique(c(
  jacardDist_gat_filt_forOtherSimilarToPat1$sample,
  jacardDist_gat_filt_forOtherSimilarToPat1$otherSample,
  allDeletionTypeMeta_hrp3_pat1$BiologicalSample
)))
simSamples <- simSamples[simSamples != "FCR3"]

# Square matrix over simSamples; NaN indexes are replaced by 0.
jacardDist_gat_filt_simToPat1 <- jacardDist_gat %>%
  filter(sample %in% simSamples, otherSample %in% simSamples) %>%
  mutate(index = ifelse(is.nan(index), 0, index))

jacardDist_gat_filt_simToPat1_sp <- jacardDist_gat_filt_simToPat1 %>%
  spread(otherSample, index)

jacardDist_gat_filt_simToPat1_sp_mat <- as.matrix(jacardDist_gat_filt_simToPat1_sp[, 2:ncol(jacardDist_gat_filt_simToPat1_sp)])
rownames(jacardDist_gat_filt_simToPat1_sp_mat) <- jacardDist_gat_filt_simToPat1_sp$sample
```

```{r}
library(circlize)
#['#b2182b','#d6604d','#f4a582','#fddbc7','#f7f7f7','#d1e5f0','#92c5de','#4393c3','#2166ac']
# Blue-white-red ramp from the matrix minimum to 1, white at the midpoint.
col_fun <- colorRamp2(
  c(
    min(jacardDist_gat_filt_simToPat1_sp_mat),
    min(jacardDist_gat_filt_simToPat1_sp_mat) + (1 - min(jacardDist_gat_filt_simToPat1_sp_mat)) / 2,
    1
  ),
  c("#2166ac", "white", "#b2182b")
)

jacardDist_gat_filt_simToPat1_sp_mat_noLabs <- jacardDist_gat_filt_simToPat1_sp_mat

# Preferred samples are taken from the full meta table here, not metaSelected.
meta_preferredSample <- meta %>%
  filter(PreferredSample)

# Metadata rows in matrix row order, plus the cluster label.
metaSelected_hrp3_pat1 <- meta_preferredSample[match(rownames(jacardDist_gat_filt_simToPat1_sp_mat), meta_preferredSample$BiologicalSample), ] %>%
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
              ungroup() %>%
              select(BiologicalSample, Chr11DupHapCluster))

sample_metadata_withAllDeletionCalls <- readr::read_tsv("sample_metadata_withAllDeletionCalls.tsv")

# PerfectChr11Copy is TRUE for close-to-perfect copies, FALSE for other
# samples present in the conserved summary, NA for everything else.
metaSelected_hrp3_pat1 <- metaSelected_hrp3_pat1 %>%
  left_join(sample_metadata_withAllDeletionCalls %>%
              rename(BiologicalSample = sample) %>%
              select(BiologicalSample, Pattern)) %>%
  mutate(PerfectChr11Copy = case_when(
    BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample ~ TRUE,
    BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum$s_Sample ~ FALSE,
    TRUE ~ NA
  )) # %>%
  # mutate(hrpCall = ifelse(BiologicalSample %in% previousDeletionCalls$BiologicalSample, hrpCall, "unknown"))

rownames(jacardDist_gat_filt_simToPat1_sp_mat_noLabs) <- NULL
colnames(jacardDist_gat_filt_simToPat1_sp_mat_noLabs) <- NULL

# Only rows/columns whose site is "LabIsolate" keep a visible label.
RowLabs <- metaSelected_hrp3_pat1$BiologicalSample
RowLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] <- ""
ColLabs <- metaSelected_hrp3_pat1$BiologicalSample
ColLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] <- ""
#RowLabs[metaSelected$country != "Ethiopia"] = ""
rownames(jacardDist_gat_filt_simToPat1_sp_mat_noLabs) <- RowLabs
colnames(jacardDist_gat_filt_simToPat1_sp_mat_noLabs) <- ColLabs

rowAnnoDf <- metaSelected_hrp3_pat1[, c("Pattern", "hrpCall", "PerfectChr11Copy", "country", "region", "secondaryRegion", "Chr11DupHapCluster")] %>%
  rename(continent = secondaryRegion) %>%
  as.data.frame()

# Start from generated colours, then override selected annotations with the
# predefined palettes.
temp_rowAnnoColors <- createColorListFromDf(rowAnnoDf)
temp_rowAnnoColors[["hrpCall"]] <- pfhrpsCallColors
temp_rowAnnoColors[["continent"]] <- continentColors
temp_rowAnnoColors[["region"]] <- rowAnnoColors$region
temp_rowAnnoColors[["Pattern"]] <- rowAnnoColorsMod_hrp3DeletionPattern
temp_rowAnnoColors[["Chr11DupHapCluster"]] <- newHaploGroupWithSingletColors

annotationTextSize <- 25
annotationTitleTextSize <- 20

# Column and row annotations share the same data frame and colours;
# NA cells use the colour "#99999900" (alpha channel 00).
topAnno <- HeatmapAnnotation(
  df = rowAnnoDf,
  col = temp_rowAnnoColors,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = c("#99999900")
)

sideAnno <- rowAnnotation(
  df = rowAnnoDf,
  col = temp_rowAnnoColors,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = c("#99999900")
)

haptype_simTo_hrp3_pat1HeatMap <- Heatmap(
  jacardDist_gat_filt_simToPat1_sp_mat_noLabs,
  cluster_columns = TRUE,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  heatmap_legend_param = list(
    # The legend title is a legend parameter, not a graphical parameter, so
    # it is set here rather than inside gpar().
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

Below will get the samples that have a chromosome 11 that is similar to the 13-11++ samples. These new similar samples will be regardless of hrp2/3 deletion status. This will show if the chr11 that has been duplicated is circulating in the general population or is only associated with the samples with HRP3 deletion and chr11 duplication.
It appears that the duplicated chromosome 11 is circulating fairly commonly among South American samples that don't have HRP3 deletion while there doesn't appear to be any of the duplicated chr11 circulating in the African population (though could be a high diversity vs low diversity bias and/or sampling biases given the drastic differences in malaria dynamics in the two continents).

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 35
# Render the heatmap inline with all legends merged along the bottom.
draw(
  haptype_simTo_hrp3_pat1HeatMap,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
```

```{r}
# Same figure written to a PDF file.
pdf("haptype_simTo_hrp3_pat1.pdf", useDingbats = FALSE, width = 30, height = 35)
draw(
  haptype_simTo_hrp3_pat1HeatMap,
  background = "transparent",
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()
```

## Plotting haplotypes typed per genomic region

Plotting out the variation at the duplicated region, coloring haplotypes by their abundance rank, this visualization will allow interpretation of how similar these haplotypes are here and what the copy looks like within sample (e.g. 
perfect copy vs variation and how much variation).

### All samples with 13-11++ HRP3 deletion

```{r}
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep <-
  HaplotypeRainbows::prepForRainbow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11, minPopSize = 1)

# select just the major haplotypes and cluster based on the sharing between
# Builds a sample x haplotype presence table (1 = present, 0 = absent) from
# targets seen in at least 99% of the maximum per-target sample count.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  #filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat <- as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp[, 2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp)]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat) <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp$s_Sample

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist <-
  dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist_hclust <-
  hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist)

jacardDist_gat_filt_sp_mat_pat1_hc <- hclust(dist(jacardDist_gat_filt_sp_mat_pat1))

#rename the levels so they are in the order of the clustering
# popid is set to -1 wherever maxPopid is 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep %>%
  mutate(s_Sample = factor(s_Sample,
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist_hclust$order])) %>%
                           # levels = rownames(jacardDist_gat_filt_sp_mat_pat1)[jacardDist_gat_filt_sp_mat_pat1_hc$order])) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# seq_along() replaces 1:length(...), which would yield c(1, 0) if there
# were no levels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot <-
  genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name),
    expand = c(0, 0)
  ) +
  scale_y_continuous(expand = c(0, 0))

# Per-sample metadata for the annotation strips, with BiologicalSample as a
# factor sharing the plot's sample order. The cluster table is still grouped
# by hcclust, so select() used to re-add that column implicitly; it is now
# ungrouped and hcclust is selected explicitly, keeping the same columns.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep <- meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$s_Sample) %>%
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
              ungroup() %>%
              select(hcclust, BiologicalSample, Chr11DupHapCluster)) %>%
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$s_Sample)))

# Concatenate every annotation palette into one vector.
allColors <- c()
for (name in names(rowAnnoColors)) {
  allColors <- c(allColors, rowAnnoColors[[name]])
}

# Fill colours used by the first plot layer, named by the sorted popid values.
previousColors <- unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot)$data[[1]][["fill"]])
names(previousColors) <- sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$popid))
previousColors["-1"] <- "grey0"
allColors <- c(allColors, previousColors)

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep %>%
  mutate(s_Sample = factor(s_Sample,
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_sp_dist_hclust$order])) %>%
  mutate(popid = factor(popid))
```

```{r}
# Base rainbow plot. The four negative x breaks are the midpoints of the
# annotation strips added below; target positions get empty labels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 <-
  genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$p_name))
    ),
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country",
      # levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$p_name),
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$p_name)))
    ),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry$s_Sample)
  )

# Add the haplotype-rank fill scale, then one strip of rectangles per
# annotation (country, region, continent, cluster), each on its own fill
# scale via ggnewscale::new_scale_fill().
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 <-
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 +
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = haplotypeRankColors[sort(names(previousColors))],
    labels = names(haplotypeRankColors[sort(names(previousColors))]),
    breaks = names(haplotypeRankColors[sort(names(previousColors))])
  ) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = 0, xmax = -4.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = country
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep
  ) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = -5, xmax = -9.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = region
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep
  ) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = -10, xmax = -14.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = secondaryRegion
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep
  ) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = -15, xmax = -19.5,
      ymin = as.numeric(BiologicalSample) - 0.5,
      ymax = as.numeric(BiologicalSample) + 0.5,
      fill = factor(Chr11DupHapCluster)
    ),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep
  ) +
  scale_fill_manual(
    "Chr11DupHapCluster",
    values = newHaploGroupWithSingletColors,
    labels = names(newHaploGroupWithSingletColors),
    breaks = names(newHaploGroupWithSingletColors)
  ) +
  guides(fill = guide_legend(nrow = 4))
```

The y-axis is samples and the x-axis is the sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank; while the same color within a vertical column is the same haplotype, the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation.
Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position means monoclonal at this position while multiple bars would indicate polyclonal (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20

# Gene annotation for the plotted loci, with genomicID sharing the plot's
# locus order so as.numeric(genomicID) is the x position of that locus.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name) %>%
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)))

# Add a gene-description strip below y = 0 and enlarge the legend text.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1 +
  new_scale_fill() +
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5,
                xmax = as.numeric(genomicID) + 0.5,
                ymax = 0,
                ymin = -5,
                fill = description), data = regions_afterHomologous_chr11_filt, color = "black") +
  scale_fill_manual("Genes\nDescription", values = descriptionColors, guide = guide_legend(nrow = 5)) +
  transparentBackground +
  theme(legend.text = element_text(size = 30),
        legend.title = element_text(size = 30, face = "bold"),
        legend.box="vertical",
        legend.margin=margin(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        axis.text.x = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1)
```

```{r}
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot.pdf", useDingbats = FALSE, width = 40, height = 30)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod1)
dev.off()
```

```{r}
# Same workflow as above but with minPopSize = 2.
# NOTE(review): presumably this drops loci without haplotype variation (the
# PDF written later is named "onlyVariableSites") -- confirm against the
# HaplotypeRainbows documentation.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11,
  minPopSize = 2
)

# select just the major haplotypes and cluster based on the sharing between
# Result: one row per sample, one 0/1 column per haplotype (h_popUID), using
# only loci typed in at least 99% of the best-covered locus' sample count.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99*max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  #filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Drop the leading s_Sample column and keep it as row names instead.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp)]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist)

#rename the levels so they are in the order of the clustering
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 %>%
  mutate(s_Sample = factor(s_Sample,
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist_hclust$order])) %>%
  # Loci whose maxPopid is 1 get the sentinel popid -1 (coloured "grey0" below).
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Plain rainbow plot; it is only built so its fill colours can be read back below.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name)),
                     labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name),
                     expand = c(0,0))

# Per-sample metadata in the mod3 sample order, so as.numeric(BiologicalSample)
# is the y position of that sample in the mod3 plot.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$s_Sample) %>%
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
              select(BiologicalSample, Chr11DupHapCluster)) %>%
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$s_Sample)))

# Concatenate every annotation palette into one vector.
allColors = c()
for(name in names(rowAnnoColors)){
  allColors = c(allColors, rowAnnoColors[[name]])
}

# Read back the fills ggplot assigned and name them by popid.
# NOTE(review): assumes the distinct fills come back in sorted-popid order -- confirm.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Same data with popid as a factor so a manual (discrete) fill scale can be used.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3 %>%
  mutate(s_Sample = factor(s_Sample,
                           levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_sp_dist_hclust$order])) %>%
  mutate(popid= factor(popid))

# x axis extends to -30 to leave room for two annotation strips and the
# sample labels drawn with geom_text() further down.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(limits = c(-30, max(c(-9.5 + 2.25, -4.5 + 2.25,
                                           seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$p_name))))),
                     breaks = c(-9.5 + 2.25, -4.5 + 2.25,
                                seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$p_name))),
                     labels = c("HaploGroup", "continent",
                                levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$p_name)),
                     expand = c(0,0))

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 +
  scale_fill_manual("Microhaplotype\nRank",
                    values = haplotypeRankColors[sort(names(previousColors))],
                    labels = names(haplotypeRankColors[sort(names(previousColors))]),
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])) +
  guides(fill = guide_legend(nrow = 4)) +
  ggnewscale::new_scale_fill() +
  # continent strip (column secondaryRegion): x in [-4.5, 0]
  geom_rect(aes(xmin= 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = secondaryRegion), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3)+
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 4)) +
  ggnewscale::new_scale_fill() +
  # haplotype-cluster strip: x in [-9.5, -5].
  # Must use the mod3 metadata: its BiologicalSample levels follow the mod3
  # clustering order, which is what the y axis of this plot uses. The non-mod3
  # metadata is ordered by the minPopSize = 1 clustering and would put the
  # cluster colours on the wrong rows.
  geom_rect(aes(xmin= -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = factor(Chr11DupHapCluster)), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3)+
  scale_fill_manual("Chr11DupHapCluster", values = newHaploGroupWithSingletColors,
                    labels = names(newHaploGroupWithSingletColors),
                    breaks = names(newHaploGroupWithSingletColors)) +
  guides(fill = guide_legend(nrow = 4))

# Gene annotation restricted to, and ordered like, the mod3 loci.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name) %>%
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3$p_name)))

# y-axis labels: blank for every sample except the three named ones.
yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$s_Sample)
yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3[yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 %!in% c("HB3", "Santa-Lucia-Salvador-I", "SD01")] = ""

# Strip all axis decoration (the axis text is blanked here, so the y labels
# above are not actually drawn).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 +
  scale_y_continuous(labels = yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3,
                     breaks = seq_along(yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3),
                     expand = c(0,0)) +
  theme(axis.text.x = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.border = element_blank()
        )

# Keep a copy without the gene strip and sample labels for the "noGeneInfo" PDF.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3_priorToGeneInfo = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3 +
  new_scale_fill() +
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5,
                xmax = as.numeric(genomicID) + 0.5,
                ymax = 0,
                ymin = -7,
                fill = description), data = regions_afterHomologous_chr11_filt, color = "black") +
  geom_text(
    aes(y = as.numeric(BiologicalSample), x = -10, label = BiologicalSample), hjust = 1,
    #data = tibble(BiologicalSample = factor(c("HB3", "Santa-Lucia-Salvador-I", "SD01"), levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$s_Sample)))
    # One row per distinct sample: the full column has one row per haplotype
    # row, which would draw each label many times on top of itself.
    data = tibble(BiologicalSample = unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod3_withCountry$s_Sample))
  ) +
  scale_fill_manual("Genes\nDescription", values = descriptionColors, guide = guide_legend(nrow = 4))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank and while colors in a vertical column are the same haplotype, the same colors between columns do not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position means monoclonal at this position while multiple bars would indicate polyclonal (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).
```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3)
```

##### Collapsing parasites by same haplotypes

```{r}
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod3_onlyVariableSites.pdf", useDingbats = FALSE, width = 25, height = 20)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3)
dev.off()

pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod3_onlyVariableSites_noGeneInfo.pdf", useDingbats = FALSE, width = 15, height = 12.5)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod3_priorToGeneInfo)
dev.off()
```

```{r}
# Per sample: number of distinct loci present and the mean of uniqHaps, joined
# to the sample's haplotype-cluster assignment.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
  group_by(s_Sample) %>%
  summarise(p_name_count = length(unique(p_name)), p_name_meanCOI = mean(uniqHaps)) %>%
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
              rename(s_Sample = BiologicalSample))

# Pick one representative sample per non-singleton cluster: the one with the
# most loci, ties broken by the lowest mean uniqHaps.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness %>%
  #filter(s_Sample %!in% c("HB3", "QV0040-C", "IGS-CBD-010")) %>%
  #filter(hcclustSize > 2, newClusterName != 9) %>%
  #filter(hcclustSize > 1, newClusterName != 9) %>%
  filter(hcclustSize > 1) %>%
  arrange(desc(p_name_count), p_name_meanCOI) %>%
  group_by(newClusterName) %>%
  mutate(groupID = row_number()) %>%
  filter(groupID == 1) %>%
  left_join(meta_preferredSample %>%
              select(BiologicalSample, secondaryRegion) %>%
              rename(s_Sample = BiologicalSample))

# Order representatives by continent, then by cluster size (largest first), and
# freeze that order in the s_Sample factor levels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt %>%
  mutate(secondaryRegion = factor(secondaryRegion, levels = c("S_AMERICA", "AFRICA", "ASIA"))) %>%
  arrange(secondaryRegion, desc(hcclustSize)) %>%
  mutate(s_Sample = factor(s_Sample, levels = .$s_Sample))

# Rainbow-plot input restricted to the representative samples.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
    filter(s_Sample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt$s_Sample),
  minPopSize = 2
)

# select just the major haplotypes and cluster based on the sharing between
# Result: one row per sample, one 0/1 column per haplotype (h_popUID), using
# only loci typed in at least 99% of the best-covered locus' sample count.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount >= 0.99*max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Drop the leading s_Sample column and keep it as row names instead.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp)]
)
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist)

jacardDist_gat_filt_sp_mat_pat1_hc = hclust(dist(jacardDist_gat_filt_sp_mat_pat1))

#rename the levels so they are in the order of the clustering
# (currently the continent / cluster-size order chosen above is used instead of
# either clustering order; the alternatives are kept commented out)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 %>%
  mutate(s_Sample = factor(s_Sample, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt$s_Sample))) %>%
  # levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust$order])) %>%
  # levels = rownames(jacardDist_gat_filt_sp_mat_pat1)[jacardDist_gat_filt_sp_mat_pat1_hc$order])) %>%
  # Loci whose maxPopid is 1 get the sentinel popid -1 (coloured "grey0" below).
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Plain rainbow plot; it is only built so its fill colours can be read back below.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name)),
                     labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name),
                     expand = c(0,0))

# Per-sample metadata in the mod4 sample order, so as.numeric(BiologicalSample)
# is the y position of that sample.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$s_Sample) %>%
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$s_Sample)))

# Concatenate every annotation palette into one vector.
allColors = c()
for(name in names(rowAnnoColors)){
  allColors = c(allColors, rowAnnoColors[[name]])
}

# Read back the fills ggplot assigned and name them by popid.
# NOTE(review): assumes the distinct fills come back in sorted-popid order -- confirm.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Same data with popid as a factor so a manual (discrete) fill scale can be used.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4 %>%
  mutate(s_Sample = factor(s_Sample, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt$s_Sample))) %>%
  # mutate(s_Sample = factor(s_Sample,
  # levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust$order])) %>%
  mutate(popid= factor(popid))

# x axis extends to -30 to leave room for the annotation strip and text labels.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(limits = c(-30, max(c(-9.5 + 2.25, -4.5 + 2.25,
                                           seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$p_name))))),
                     breaks = c(-9.5 + 2.25, -4.5 + 2.25,
                                seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$p_name))),
                     labels = c("HaploGroup", "continent",
                                levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$p_name)),
                     expand = c(0,0))

# Cut the tree two ways and plot each dendrogram: first into k groups (one per
# representative sample), then at height h. The height-based cut overwrites
# the k-based groups and is the one used from here on.
k_groups = nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_regionCompletionness_filt)
h_groups = 1.1

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups = cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust, k = k_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend <- as.dendrogram(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend <- color_labels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend, k = k_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend)

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups = cutree(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust, h = h_groups)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend <- as.dendrogram(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend <- color_labels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend, h = h_groups)
plot(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_dend)

jacardDist_gat_filt_sp_mat_pat1_hc_groups = cutree(jacardDist_gat_filt_sp_mat_pat1_hc, k = k_groups)
jacardDist_gat_filt_sp_mat_pat1_hc_dend <- as.dendrogram(jacardDist_gat_filt_sp_mat_pat1_hc)
jacardDist_gat_filt_sp_mat_pat1_hc_dend <- color_labels(jacardDist_gat_filt_sp_mat_pat1_hc_dend, k = k_groups)
plot(jacardDist_gat_filt_sp_mat_pat1_hc_dend)

# Sample -> cluster table with the size of each cluster.
# The result stays grouped by hcclust.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df = tibble(
  BiologicalSample = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups),
  hcclust = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups
) %>%
  mutate(BiologicalSample =factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$BiologicalSample))) %>%
  group_by(hcclust) %>%
  mutate(hcclustSize = n())

# Cluster ids split into multi-sample clusters and singletons.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_biggerGroups = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>%
  filter(hcclustSize != 1) %>%
  select(hcclust) %>%
  unique()
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_singletonGroups = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>%
  filter(hcclustSize == 1) %>%
  select(hcclust) %>%
  unique()

# One palette per class, each sized to the TOTAL number of clusters.
# NOTE(review): each names() vector is shorter than its palette, so R pads the
# remaining names with NA and haploGroupColors carries NA-named surplus
# entries. Both palettes are also drawn from the same scheme with the same
# size, so a multi-sample cluster and a singleton may share a colour if
# scheme$hex() is deterministic. Left unchanged because downstream use of
# haploGroupColors is not visible from here -- confirm before tightening.
nonSingletonGroupsColors = scheme$hex(nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% select(hcclust) %>% unique()))
names(nonSingletonGroupsColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_biggerGroups$hcclust
nonSingletonGroupsColors_singleton = scheme$hex(nrow(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>% select(hcclust) %>% unique()))
names(nonSingletonGroupsColors_singleton) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_singletonGroups$hcclust
haploGroupColors = c(nonSingletonGroupsColors, nonSingletonGroupsColors_singleton)

# Renumber clusters by decreasing size (newClusterName) and attach their colour.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>%
  select(hcclust, hcclustSize) %>%
  ungroup() %>%
  unique() %>%
  arrange(desc(hcclustSize)) %>%
  mutate(hcclust = as.character(hcclust),newClusterName = row_number()) %>%
  left_join(tibble(
    hcclust = names(haploGroupColors),
    colors = unname(haploGroupColors)
  ))

newHaploGroupColors = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts$colors
names(newHaploGroupColors)= popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts$newClusterName

# Add the renumbered cluster info and the size of each sample's ORIGINAL
# cluster (from the full-sample clustering) for the "n=" labels below.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df_groupCounts %>%
      mutate(hcclust = as.integer(hcclust))
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      select(BiologicalSample, hcclustSize) %>%
      rename(originalGroupSize = hcclustSize)
  ) %>%
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$BiologicalSample)))

# jacardDist_gat_filt_sp_mat_pat1_hc_groups_df = tibble(
#   BiologicalSample = names(jacardDist_gat_filt_sp_mat_pat1_hc_groups),
#   hcclust = jacardDist_gat_filt_sp_mat_pat1_hc_groups
# ) %>%
#   mutate(BiologicalSample =factor(BiologicalSample, levels = levels(meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$BiologicalSample)))

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 +
  scale_fill_manual("Microhaplotype\nRank",
                    values = haplotypeRankColors[sort(names(previousColors))],
                    labels = names(haplotypeRankColors[sort(names(previousColors))]),
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])
                    ) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  # continent strip (column secondaryRegion): x in [-4.5, 0]; the top edge
  # stops at +0.3, leaving a gap between rows.
  geom_rect(aes(xmin= 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.3,
                fill = secondaryRegion), color = "black", data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4)+
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 4)) +
  # ggnewscale::new_scale_fill() +
  # geom_rect(aes(xmin= -5, xmax = -9.5,
  #               ymin = as.numeric(BiologicalSample) - 0.5,
  #               ymax = as.numeric(BiologicalSample) + 0.3,
  #               fill = factor(newClusterName)), color = "black", data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df)+
  # # fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+
  # # scale_fill_manual("HaploGroup", values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups)))) +
  # #scale_fill_manual("Chr11DupHapCluster", values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors)) +
  # scale_fill_manual("Chr11DupHapCluster", values = newHaploGroupColors, labels = names(newHaploGroupColors),
  #                   breaks = names(newHaploGroupColors)) +
  # "n=<size>" label per representative, giving the size of its original cluster.
  geom_text(aes(
    x = -9.5,
    y = as.numeric(BiologicalSample) - 0.5 + 0.4,
    label = paste0("n=", originalGroupSize)
  ), color = "black", data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_sp_dist_hclust_groups_df)+
  guides(fill = guide_legend(nrow = 4))

# Gene annotation restricted to, and ordered like, the mod4 loci.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name) %>%
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4$p_name)))

yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$s_Sample)
# yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4[yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 %!in% c("HB3", "Santa-Lucia-Salvador-I", "SD01")] = ""

# Strip all axis decoration and box the legends.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 +
  scale_y_continuous(labels = yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4,
                     breaks = seq_along(yLabels_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4),
                     expand = c(0,0)) +
  theme(axis.text.x = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.border = element_blank(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black", linewidth = 1)
        )

# Keep a copy without the gene strip and sample labels for the "noGeneInfo" PDF.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4_priorToGeneInfo = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4 +
  new_scale_fill() +
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5,
                xmax = as.numeric(genomicID) + 0.5,
                ymax = 0,
                ymin = -1,
                fill = description), data = regions_afterHomologous_chr11_filt, color = "black") +
  # Name labels for the three listed samples only; any of them that is not a
  # level of s_Sample becomes NA here.
  geom_text(
    aes(y = as.numeric(BiologicalSample), x = -10, label = BiologicalSample), hjust = 1,
    data = tibble(BiologicalSample = factor(c("HB3", "Santa-Lucia-Salvador-I", "SD01"), levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_mod4_withCountry$s_Sample)))
  ) +
  scale_fill_manual("Genes\nDescription", values = descriptionColors, guide = guide_legend(nrow = 4))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank and while colors in a vertical column are the same haplotype, the same colors between columns do not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation.
Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position means monoclonal at this position while multiple bars would indicate polyclonal (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4)
```

```{r}
# Write the mod4 figure to PDF twice: the full plot, and the copy that was
# saved before the gene-description track was added.
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod4_onlyVariableSites.pdf", useDingbats = FALSE, width = 25, height = 15)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4)
dev.off()

pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_plot_mod4_onlyVariableSites_noGeneInfo.pdf", useDingbats = FALSE, width = 15, height = 6)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_withCountry_plot_mod4_priorToGeneInfo)
dev.off()
```

#### Perfect copies

```{r}
# Rainbow-plot input restricted to the samples listed in
# ..._conservedSum_closeToPerfectCopies.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
    filter(s_Sample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample),
  minPopSize = 1)

# Sample x haplotype presence/absence table used for clustering.
# Only targets whose sample count exceeds 90% of the largest per-target sample
# count are kept. The major-haplotype filter
# (c_AveragedFrac == max(c_AveragedFrac)) is disabled, so every haplotype a
# sample carries is marked with 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount > 0.9*max(sampleCount)) %>%
  mutate(marker = 1) %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Drop the sample column by name (not by position) before building the matrix.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp %>%
    select(-s_Sample))
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist)

# Re-level the samples so they follow the order of the clustering, and recode
# popid to -1 wherever maxPopid is 1 (that key is given "grey0" below).
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>%
  mutate(s_Sample = factor(
    s_Sample,
    levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_sp_dist_hclust$order])) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# First pass of the plot (numeric popid); it is only used to read back the fill
# colours that were assigned.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$p_name),
    expand = c(0,0))

# Sample metadata for the plotted samples, in the same sample order as the plot.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample) %>%
  mutate(BiologicalSample = factor(
    BiologicalSample,
    levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample)))

# Collect every annotation colour plus the haplotype fills of the first pass.
allColors = c()
for(name in names(rowAnnoColors)){
  allColors = c(allColors, rowAnnoColors[[name]])
}
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# s_Sample already carries the clustering order from above; only popid needs
# to become a factor for the discrete fill scale.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>%
  mutate(popid = factor(popid))

# Second pass of the plot. The four negative x breaks label the annotation
# tracks drawn to the left of the haplotype columns; the per-target labels
# (levels of p_name) are intentionally blanked.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(-19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
               seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name))),
    labels = c("Chr11DupHapCluster", "continent", "region", "country",
               rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$p_name)))),
    expand = c(0,0)) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry$s_Sample)
  )

# Cluster assignments (from the forClustering grouping) for the plotted samples.
# Clusters of size 1 are labelled "singlet"; other names are zero-padded to
# width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  ungroup() %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample) %>%
  mutate(BiologicalSample = factor(
    as.character(BiologicalSample),
    levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies$s_Sample))) %>%
  mutate(Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0")))

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies %>%
  select(Chr11DupHapClusterName, colors) %>%
  unique() %>%
  arrange(Chr11DupHapClusterName)

# Named colour vector (cluster name -> colour) for the Chr11DupHapCluster legend.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColorsDf$colors
names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName

# Add the haplotype-rank fill scale, then one annotation track per fill scale:
# country [-4.5, 0], region [-9.5, -5], continent [-14.5, -10] and
# Chr11DupHapCluster [-19.5, -15].
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot +
  scale_fill_manual("Microhaplotype\nRank",
                    values = haplotypeRankColors[sort(names(previousColors))],
                    labels = names(haplotypeRankColors[sort(names(previousColors))]),
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = country),
            color = "black",
            data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = region),
            color = "black",
            data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= -10, xmax = -14.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = secondaryRegion),
            color = "black",
            data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= -15, xmax = -19.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = factor(Chr11DupHapClusterName)),
            color = "black",
            data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies) +
  scale_fill_manual("Chr11DupHapCluster",
                    values = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors,
                    labels = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors),
                    breaks = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_Chr11DupHapClusterColors)) +
  guides(fill = guide_legend(nrow = 4))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank and while colors in a vertical column are the same haplotype, the same colors between columns do not mean same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation.
Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position means monoclonal at this position while multiple bars would indicate polyclonal (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).

```{r}
#| fig-column: screen
#| fig-width: 40
#| fig-height: 35
# Gene annotations for the targets in the rainbow plot, with genomicID levelled
# like p_name so as.numeric(genomicID) gives the x position.
# NOTE(review): the levels come from ..._chr11_prep$p_name while the plot below
# is built from ..._prep_closeToPerfectCopies; confirm both have identical
# p_name levels, otherwise the gene track is shifted relative to the columns.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name) %>%
  mutate(genomicID = factor(
    genomicID,
    levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)))

# Add the gene-description track below the samples (y from -5 to 0) and enlarge
# the legend and axis text for the large figure.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5,
                xmax = as.numeric(genomicID) + 0.5,
                ymax = 0, ymin = -5,
                fill = description),
            data = regions_afterHomologous_chr11_filt,
            color = "black") +
  scale_fill_manual("Genes\nDescription", values = descriptionColors, guide = guide_legend(nrow = 5)) +
  transparentBackground +
  theme(legend.text = element_text(size = 30),
        legend.title = element_text(size = 30, face = "bold"),
        legend.box="vertical",
        legend.margin=margin(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        axis.text.x = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot)
```

```{r}
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_plot.pdf", useDingbats = FALSE, width = 40, height = 35)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_closeToPerfectCopies_withCountry_plot + labs(title = "Perfect Copies"))
dev.off()
```

#### Divergent copies

```{r}
# Rainbow-plot input for every sample NOT listed in
# ..._conservedSum_closeToPerfectCopies.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
    filter(s_Sample %!in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample),
  minPopSize = 1)

# Sample x haplotype presence/absence table. Rows are kept when samp_n exceeds
# 90% of its maximum. The major-haplotype filter
# (c_AveragedFrac == max(c_AveragedFrac)) is disabled, so every haplotype a
# sample carries is marked with 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>%
  ungroup() %>%
  filter(samp_n > 0.9*max(samp_n)) %>%
  mutate(marker = 1) %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Drop the sample column by name (not by position) before building the matrix.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp %>%
    select(-s_Sample))
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_dist)

# The sample order is taken from the forClustering dendrogram (restricted to
# the samples in the divergent matrix), not from the hclust computed just above.
nameOrderFromforClustering = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust$order]
orderForDivergentCopy = nameOrderFromforClustering[nameOrderFromforClustering %in% rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_sp_mat)]

# Re-level the samples in that order and recode popid to -1 wherever maxPopid
# is 1 (that key is given "grey0" below).
# NOTE(review): any s_Sample missing from orderForDivergentCopy becomes NA here;
# confirm every divergent sample is present in the forClustering matrix.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>%
  mutate(s_Sample = factor(s_Sample, levels = orderForDivergentCopy)) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# First pass of the plot (numeric popid); it is only used to read back the fill
# colours that were assigned.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$p_name),
    expand = c(0,0))

# Sample metadata for the plotted samples, in the same sample order as the plot.
meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample) %>%
  mutate(BiologicalSample = factor(
    BiologicalSample,
    levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample)))

# Collect every annotation colour plus the haplotype fills of the first pass.
allColors = c()
for(name in names(rowAnnoColors)){
  allColors = c(allColors, rowAnnoColors[[name]])
}
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)
# s_Sample already carries the orderForDivergentCopy levels from above; only
# popid needs to become a factor for the discrete fill scale.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>%
  mutate(popid = factor(popid))

# Cluster assignments (from the forClustering grouping) for the plotted samples.
# Clusters of size 1 are labelled "singlet"; other names are zero-padded to
# width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  ungroup() %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample) %>%
  mutate(BiologicalSample = factor(
    as.character(BiologicalSample),
    levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies$s_Sample))) %>%
  mutate(Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))) %>%
  arrange(BiologicalSample)

popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies %>%
  select(Chr11DupHapClusterName, colors) %>%
  unique() %>%
  arrange(Chr11DupHapClusterName)

# Named colour vector (cluster name -> colour) for the Chr11DupHapCluster legend.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColorsDf$colors
names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName

# Second pass of the plot. The four negative x breaks label the annotation
# tracks drawn to the left of the haplotype columns; the per-target labels
# (levels of p_name) are intentionally blanked.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(-19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
               seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$p_name))),
    labels = c("Chr11DupHapCluster", "continent", "region", "country",
               rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$p_name)))),
    expand = c(0,0)) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry$s_Sample)
  )

# Add the haplotype-rank fill scale, then one annotation track per fill scale:
# country [-4.5, 0], region [-9.5, -5], continent [-14.5, -10] and
# Chr11DupHapCluster [-19.5, -15].
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot +
  scale_fill_manual("Microhaplotype\nRank",
                    values = haplotypeRankColors[sort(names(previousColors))],
                    labels = names(haplotypeRankColors[sort(names(previousColors))]),
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= 0, xmax = -4.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = country),
            color = "black",
            data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= -5, xmax = -9.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = region),
            color = "black",
            data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= -10, xmax = -14.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = secondaryRegion),
            color = "black",
            data = meta_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin= -15, xmax = -19.5,
                ymin = as.numeric(BiologicalSample) - 0.5,
                ymax = as.numeric(BiologicalSample) + 0.5,
                fill = factor(Chr11DupHapClusterName)),
            color = "black",
            data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies) +
  scale_fill_manual("Chr11DupHapCluster",
                    values = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors,
                    labels = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors),
                    breaks = names(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_Chr11DupHapClusterColors)) +
  guides(fill = guide_legend(nrow = 4))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank and while colors in a vertical column are the same haplotype, the same colors between columns do not mean same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position means monoclonal at this position while multiple bars would indicate polyclonal (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
# Gene annotations for the targets in the rainbow plot, with genomicID levelled
# like p_name so as.numeric(genomicID) gives the x position.
# NOTE(review): the levels come from ..._chr11_prep$p_name while the plot below
# is built from ..._prep_divergentCopies; confirm both have identical p_name
# levels, otherwise the gene track is shifted relative to the columns.
regions_afterHomologous_chr11_filt = regions_afterHomologous_chr11 %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name) %>%
  mutate(genomicID = factor(
    genomicID,
    levels = levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep$p_name)))

# Add the gene-description track below the samples (y from -1 to 0) and enlarge
# the legend and axis text for the large figure.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5,
                xmax = as.numeric(genomicID) + 0.5,
                ymax = 0, ymin = -1,
                fill = description),
            data = regions_afterHomologous_chr11_filt,
            color = "black") +
  scale_fill_manual("Genes\nDescription", values = descriptionColors, guide = guide_legend(nrow = 5)) +
  transparentBackground +
  theme(legend.text = element_text(size = 30),
        legend.title = element_text(size = 30, face = "bold"),
        legend.box="vertical",
        legend.margin=margin(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        axis.text.x = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot)
```

```{r}
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_plot.pdf", useDingbats = FALSE, width = 40, height = 30)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_divergentCopies_withCountry_plot + labs(title = "Divergent Copies"))
dev.off()
```

### Sub set

### SD01, HB3, Santa-Lucia-Salvador-I

```{r}
# Rainbow-plot input for the three named isolates only.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11 %>%
    filter(s_Sample %in% c("HB3", "SD01", "Santa-Lucia-Salvador-I")),
  minPopSize = 1)

# Sample x haplotype presence/absence table used for clustering.
# Only targets whose sample count exceeds 90% of the largest per-target sample
# count are kept. The major-haplotype filter
# (c_AveragedFrac == max(c_AveragedFrac)) is disabled, so every haplotype a
# sample carries is marked with 1.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount > 0.9*max(sampleCount)) %>%
  mutate(marker = 1) %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

# Drop the sample column by name (not by position) before building the matrix.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat = as.matrix(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp %>%
    select(-s_Sample))
rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat) = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat)
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist)

# Re-level the samples so they follow the order of the clustering.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates %>%
  mutate(s_Sample = factor(
    s_Sample,
    levels = rownames(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_mat)[popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_sp_dist_hclust$order]))

# First pass of the plot (popid as-is); it is only used to read back the fill
# colours that were assigned. Per-target x labels are intentionally blanked.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates, colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name)),
    labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name))),
    expand = c(0,0))

previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$popid))
previousColors["-1"] = "grey0"

# Second pass of the plot, with popid as a factor for the discrete fill scale.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot = genRainbowHapPlotObj(
  popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates %>%
    mutate(popid = factor(popid)),
  colorCol = popid) +
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name)),
    labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates$p_name))),
    expand = c(0,0))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank and while colors in a vertical column are the same haplotype, the same colors between columns do not mean same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position means monoclonal at this position while multiple bars would indicate polyclonal (and in this instance would mean the copy on chr11 and chr13 is not a perfect copy).

It appears that SD01 and Santa-Lucia-Salvador-I have perfect copies of chr11 on chr11 and chr13 while HB3 has a divergent copy (which is confirmed with the nanopore assembly).

Interestingly enough, the Santa-Lucia-Salvador-I chr11 duplicated region appears to be one of the chr11 in HB3.
```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 5
# Finish the lab-isolate plot: haplotype-rank fill scale, then the
# gene-description track below the samples (y from -1 to 0), then enlarged
# legend and axis text. regions_afterHomologous_chr11_filt is the table built
# in an earlier chunk.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot +
  scale_fill_manual("Microhaplotype\nRank",
                    values = haplotypeRankColors[sort(names(previousColors))],
                    labels = names(haplotypeRankColors[sort(names(previousColors))]),
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])) +
  guides(fill = guide_legend(nrow = 1)) +
  ggnewscale::new_scale_fill() +
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5,
                xmax = as.numeric(genomicID) + 0.5,
                ymax = 0, ymin = -1,
                fill = description),
            data = regions_afterHomologous_chr11_filt,
            color = "black") +
  scale_fill_manual("Genes\nDescription", values = descriptionColors, guide = guide_legend(nrow = 5)) +
  transparentBackground +
  theme(legend.text = element_text(size = 30),
        legend.title = element_text(size = 30, face = "bold"),
        legend.box="vertical",
        legend.margin=margin(),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        axis.text.x = element_text(size = 30),
        axis.text.y = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot)
```

```{r}
pdf("popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot.pdf", useDingbats = FALSE, width = 40, height = 7.5)
print(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_LabIsolates_plot)
dev.off()
```

# Shared Region between chr11 and chr13

The data on the 15.2kb duplicated region between chromosome 11 and 13.
```{r}
# Sub-windows left out of the shared-region analysis.
excludeRegions = c(
  "Pf3D7_11_v3-1919633-1920323-for__var-6",
  "Pf3D7_11_v3-1920483-1921173-for__var-3",
  "Pf3D7_11_v3-1920483-1921173-for__var-4",
  "Pf3D7_11_v3-1920483-1921173-for__var-5",
  "Pf3D7_11_v3-1920483-1921173-for__var-6",
  "Pf3D7_11_v3-1920483-1921173-for__var-7",
  "Pf3D7_11_v3-1928369-1928869-for__var-3",
  "Pf3D7_11_v3-1928619-1929119-for__var-3"
)

# chr11 windows flagged as "shared", minus the excluded ones.
regions_homologousRegion = regions %>%
  filter(
    homologousRegion == "shared",
    `#chrom` == "Pf3D7_11_v3",
    name %!in% excludeRegions
  )

# Haplotype rows restricted to those windows; uniqHaps is the number of rows
# per sample and window. The result stays grouped by (s_Sample, p_name).
popClustering_filt_hrp3_pat1_regions_homologousRegion = popClustering_filt_hrp3_pat1 %>%
  filter(p_name %in% regions_homologousRegion$genomicID) %>%
  group_by(s_Sample, p_name) %>%
  mutate(uniqHaps = n())

# Per sample: how often each uniqHaps value occurs, relative to the number of
# windows the sample has.
# NOTE(review): count() tallies rows, so a window with uniqHaps == 2
# contributes two rows to n -- confirm that is the intended numerator.
popClustering_filt_hrp3_pat1_regions_homologousRegion_uniqSum = popClustering_filt_hrp3_pat1_regions_homologousRegion %>%
  group_by(s_Sample) %>%
  mutate(targets = length(unique(genomicID))) %>%
  group_by(s_Sample, targets, uniqHaps) %>%
  count() %>%
  mutate(freq = n / targets)

# Per sample: fraction of windows carrying a single haplotype.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum = popClustering_filt_hrp3_pat1_regions_homologousRegion %>%
  group_by(s_Sample) %>%
  summarise(
    conserved = sum(uniqHaps == 1),
    targets = length(unique(genomicID))
  ) %>%
  mutate(conservedID = conserved / targets)

# Samples above this conserved fraction are treated as (close to) perfect copies.
conservedCutOff = 0.99

popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>%
  filter(conservedID > conservedCutOff)

# Shared tail of the three cut-off summaries below: count the marked samples
# and all samples within the current grouping and take their ratio.
summarisePerfectCopyFreq_homologousRegion = function(markedSamples) {
  markedSamples %>%
    summarise(
      perfectDuplication = sum(marker),
      totalSamps = length(unique(s_Sample))
    ) %>%
    mutate(perfectCopyFreq = perfectDuplication / totalSamps)
}

# Overall.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOff = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  ungroup() %>%
  summarisePerfectCopyFreq_homologousRegion()

# By region (sample metadata joined on the columns the two tables share).
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByRegion = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(metaByBioSample %>% rename(s_Sample = sample)) %>%
  group_by(region) %>%
  summarisePerfectCopyFreq_homologousRegion()

# By secondaryRegion.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByContinent = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>%
  mutate(marker = conservedID > conservedCutOff) %>%
  left_join(metaByBioSample %>% rename(s_Sample = sample)) %>%
  group_by(secondaryRegion) %>%
  summarisePerfectCopyFreq_homologousRegion()
```

The number of samples with perfect copies

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOff)
```

The number of samples with perfect copies broken down by regions

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByRegion)
```

The number of samples with perfect copies broken down by continent.

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_cutOffByContinent)

# Mean, minimum and standard deviation of the conserved fraction among the
# samples at or below the cut-off.
popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_meanId = popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum %>%
  filter(conservedID <= conservedCutOff) %>%
  summarise(
    meanID = mean(conservedID),
    minID = min(conservedID),
    sdID = sd(conservedID)
  )
```

The breakdown of level of divergence in the samples with divergent samples.

```{r}
create_dt(popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_meanId)
```

## Population analysis of chr11/chr13 shared region

Calculating the population of the haplotypes of the shared region on chr11/chr13

```{r}
# All samples (not only the hrp3 pattern-1 subset), restricted to the shared
# windows and with the erroneous regions removed.
popClustering_filt_regions_homologousRegion = popClustering %>%
  filter(genomicID %!in% erroneousRegions) %>%
  filter(p_name %in% regions_homologousRegion$genomicID)

# Number of distinct windows observed per sample.
popClustering_filt_regions_homologousRegion_tarCounts = popClustering_filt_regions_homologousRegion %>%
  group_by(s_Sample) %>%
  summarise(tarCounts = length(unique(p_name)))

# Keep samples with at least 80% of the best-covered sample's windows, plus
# every sample listed in previousDeletionCalls regardless of coverage.
popClustering_filt_regions_homologousRegion_tarCounts_filt = popClustering_filt_regions_homologousRegion_tarCounts %>%
  filter(tarCounts >= 0.80 * max(tarCounts) | s_Sample %in% previousDeletionCalls$BiologicalSample)

# Number of distinct samples observed per window.
popClustering_filt_regions_homologousRegion_sampCounts = popClustering_filt_regions_homologousRegion %>%
  group_by(p_name) %>%
  summarise(sampCounts = length(unique(s_Sample)))

# Table consumed by the elucidator call in the next chunk.
write_tsv(
  popClustering_filt_regions_homologousRegion %>%
    filter(s_Sample %in% popClustering_filt_regions_homologousRegion_tarCounts_filt$s_Sample) %>%
    ungroup() %>%
    select(s_Sample, p_name, h_popUID, c_AveragedFrac),
  "popClustering_filt_regions_homologousRegion.tab.txt.gz"
)
```

```{bash, eval = FALSE}
elucidator doPairwiseComparisonOnHapsSharingDev --tableFnp popClustering_filt_regions_homologousRegion.tab.txt.gz --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 14 --dout pairwiseComps_regions_homologousRegion --verbose --overWriteDir --metaFnp metaByBioSample_outwithHrpCalls.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion,HRP3_deletionPattern --writeOutDistMatrices
```

```{r}
#jacardDist = readr::read_tsv("pairwiseComps_regions_homologousRegion/percOfTarSharingAtLeastOneHap.tab.txt.gz", col_names = F)
# Header-less distance matrix written by the elucidator run above.
jacardDist_homologousRegion = readr::read_tsv("pairwiseComps_regions_homologousRegion/jacardByHapsTarShared.tab.txt.gz", col_names = FALSE)
# Sample names give both the column names and a `sample` id column.
jacardDist_homologousRegionSamps = readr::read_tsv("pairwiseComps_regions_homologousRegion/sampleNames.tab.txt", col_names = "samples")
colnames(jacardDist_homologousRegion) = jacardDist_homologousRegionSamps$samples
jacardDist_homologousRegion$sample = jacardDist_homologousRegionSamps$samples
# jacardDist_homologousRegion_filt = jacardDist_homologousRegion %>%
#   filter(sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample)
# Keep only rows and columns for the samples in allDeletionTypeMeta_hrp3_pat1.
jacardDist_homologousRegion_filt = jacardDist_homologousRegion[jacardDist_homologousRegion$sample %in% allDeletionTypeMeta_hrp3_pat1$BiologicalSample, c(allDeletionTypeMeta_hrp3_pat1$BiologicalSample, "sample")]

# Long form (sample, otherSample, index), filter, then back to a wide table.
jacardDist_homologousRegion_gat = jacardDist_homologousRegion_filt %>%
  gather(otherSample, index, 1:(ncol(.) - 1))
jacardDist_homologousRegion_gat_filt = jacardDist_homologousRegion_gat %>%
  filter(
    sample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample,
    otherSample %fin% allDeletionTypeMeta_hrp3_pat1$BiologicalSample
  )
jacardDist_homologousRegion_gat_filt_sp = jacardDist_homologousRegion_gat_filt %>%
  spread(otherSample, index)

# Numeric matrix: drop the leading `sample` column and use it as row names.
jacardDist_homologousRegion_gat_filt_sp_mat = as.matrix(jacardDist_homologousRegion_gat_filt_sp[, 2:ncol(jacardDist_homologousRegion_gat_filt_sp)])
rownames(jacardDist_homologousRegion_gat_filt_sp_mat) = jacardDist_homologousRegion_gat_filt_sp$sample
```

```{r}
library(circlize)
#col_fun = colorRamp2(c(0, 0.5, 1), c(heat.colors(3)))
# Blue-white-red ramp from the smallest observed index up to 1, with white at
# the midpoint between the two.
jacardMin_homologousRegion = min(jacardDist_homologousRegion_gat_filt_sp_mat)
col_fun = colorRamp2(
  c(jacardMin_homologousRegion, jacardMin_homologousRegion + (1 - jacardMin_homologousRegion) / 2, 1),
  c("#2166ac", "white", "#b2182b")
)

jacardDist_homologousRegion_gat_filt_sp_mat_noLabs = jacardDist_homologousRegion_gat_filt_sp_mat

# Metadata rows lined up with the matrix rows, plus the perfect-copy flag and
# the chr11 duplicated-haplotype cluster name ("singlet" for clusters of one).
meta_preferredSample = meta %>%
  filter(PreferredSample)
metaSelected_hrp3_pat1 = meta_preferredSample[match(rownames(jacardDist_homologousRegion_gat_filt_sp_mat), meta_preferredSample$BiologicalSample), ]
metaSelected_hrp3_pat1 = metaSelected_hrp3_pat1 %>%
  mutate(PerfectChr11Copy = BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

# Only lab isolates keep a visible row/column label; all others are blanked.
rownames(jacardDist_homologousRegion_gat_filt_sp_mat_noLabs) = NULL
colnames(jacardDist_homologousRegion_gat_filt_sp_mat_noLabs) = NULL
RowLabs = metaSelected_hrp3_pat1$BiologicalSample
RowLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] = ""
ColLabs = metaSelected_hrp3_pat1$BiologicalSample
ColLabs[metaSelected_hrp3_pat1$site != "LabIsolate" | is.na(metaSelected_hrp3_pat1$site)] = ""
#RowLabs[metaSelected$country != "Ethiopia"] = ""
rownames(jacardDist_homologousRegion_gat_filt_sp_mat_noLabs) = RowLabs
colnames(jacardDist_homologousRegion_gat_filt_sp_mat_noLabs) = ColLabs

# Annotation table shared by the top and left annotations.
rowAnnoDf = metaSelected_hrp3_pat1[, c("hrpCall", "PerfectChr11Copy", "country", "region", "secondaryRegion", "newClusterName")] %>%
  rename(continent = secondaryRegion, Chr11DupHapCluster = newClusterName) %>%
  as.data.frame()
annotationTextSize = 25
annotationTitleTextSize = 20
rowAnnoColors_mod = rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors

# Top annotation: same content as the side annotation but without legends.
topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  gp = gpar(col = "grey10")
)
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  gp = gpar(col = "grey10")
)

haptype_hrp3_regions_homologousRegion_pat1HeatMap = Heatmap(
  jacardDist_homologousRegion_gat_filt_sp_mat_noLabs,
  cluster_columns = TRUE,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  heatmap_legend_param = list(
    # The legend title is a legend parameter; it used to sit inside gpar(),
    # where it was silently ignored.
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

```{r}
#| fig-column: screen
#| fig-width: 25
#| fig-height: 15
draw(haptype_hrp3_regions_homologousRegion_pat1HeatMap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
# Same heatmap written to a PDF.
pdf("haptype_hrp3_regions_homologousRegion_pat1HeatMap.pdf", useDingbats = FALSE, width = 25, height = 20)
draw(haptype_hrp3_regions_homologousRegion_pat1HeatMap, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

### Plotting haplotypes

Plotting out the variation at the duplicated region, coloring haplotypes by their abundance rank. This visualization will allow interpretation of how similar these haplotypes are here and what the copy looks like within a sample (e.g.
perfect copy vs variation and how much variation).

### All

```{r}
# Gene description per window: "intergenic" when extraField0 holds no
# annotation, otherwise the text after "description=" with "]" removed.
regions_homologousRegion = regions_homologousRegion %>%
  mutate(
    description = case_when(
      grepl("extraField0=NA", extraField0) ~ "intergenic",
      TRUE ~ gsub("\\]", "", gsub(".*description=", "", extraField0))
    )
  )

# One color per distinct description; intergenic is fully transparent.
descriptionColors_homologousRegion = scheme$hex(length(regions_homologousRegion$description %>% unique()))
names(descriptionColors_homologousRegion) = regions_homologousRegion$description %>% unique()
descriptionColors_homologousRegion["intergenic"] = c("#FF000000")

popClustering_filt_hrp3_pat1_regions_homologousRegion_prep = HaplotypeRainbows::prepForRainbow(popClustering_filt_hrp3_pat1_regions_homologousRegion, minPopSize = 1)

# Sample-by-haplotype presence/absence table used to cluster the samples.
# Only windows seen in more than 90% of the best-covered window's sample count
# are kept. The major-haplotype-only filter is currently commented out, so
# every haplotype of a sample is marked.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount > 0.9 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp[, 2:ncol(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist)

# Re-level the samples so they follow the clustering order. popid becomes -1
# wherever maxPopid == 1 (drawn as "grey0" via previousColors below).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep %>%
  mutate(s_Sample = factor(s_Sample, levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust$order])) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# Plain rainbow plot with window names on x and sample names on y.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$p_name),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample)
  )

# Sample metadata in the same sample order as the plot.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample) %>%
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$s_Sample)))

# Collect every annotation color, then the fill colors the plot actually
# used, named by the sorted popid values.
allColors = c()
for (name in names(rowAnnoColors)) {
  allColors = c(allColors, rowAnnoColors[[name]])
}
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Same data with popid as a factor, for the annotated plot.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep %>%
  mutate(s_Sample = factor(s_Sample, levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust$order])) %>%
  mutate(popid = factor(popid))

# The four leading breaks label the annotation strips drawn at negative x;
# the window positions themselves get empty labels.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name))
    ),
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country",
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name)))
      # levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name)
    ),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample)
  )

# chr11 duplicated-haplotype cluster per plotted sample, in plot order;
# clusters of size one are named "singlet".
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  ungroup() %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample) %>%
  mutate(BiologicalSample = factor(as.character(BiologicalSample), levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$s_Sample))) %>%
  mutate(Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))) %>%
  arrange(BiologicalSample)
# Named color vector for the chr11 duplicated-haplotype clusters.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry %>%
  select(Chr11DupHapClusterName, colors) %>%
  unique() %>%
  arrange(Chr11DupHapClusterName)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColorsDf$colors
names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName

# Rank names for the microhaplotype legend. sort() on the character names is
# lexicographic (1, 10, 11, 2, ...), so order them by numeric value instead;
# names that are not numeric keep an alphabetical order after the numeric ones.
homologousRegion_rankNames = names(previousColors)
homologousRegion_rankNames = homologousRegion_rankNames[
  order(suppressWarnings(as.numeric(homologousRegion_rankNames)), homologousRegion_rankNames)
]
homologousRegion_rankColors = haplotypeRankColors[homologousRegion_rankNames]

# Add the rank fill scale, then four annotation strips at negative x, each on
# its own fill scale: country (0 to -4.5), region (-5 to -9.5),
# secondaryRegion (-10 to -14.5) and chr11 cluster (-15 to -19.5).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot +
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = homologousRegion_rankColors,
    labels = names(homologousRegion_rankColors),
    breaks = names(homologousRegion_rankColors)
  ) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = 0, xmax = -4.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = country),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep
  ) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = -5, xmax = -9.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = region),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep
  ) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = -10, xmax = -14.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = secondaryRegion),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep
  ) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = -15, xmax = -19.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = factor(Chr11DupHapClusterName)),
    color = "black",
    data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry
  ) +
  #               fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+
  # scale_fill_manual("HaploGroup", values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups)))) +
  #scale_fill_manual("Chr11DupHapCluster", values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors)) +
  scale_fill_manual(
    "Chr11DupHapCluster",
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors,
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors),
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_Chr11DupHapClusterColors)
  ) +
  guides(fill = guide_legend(nrow = 4))
```

The y-axis is samples and the x-axis is sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank; while the same color within a vertical column means the same haplotype, the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation.
Bar heights are relative to abundance within the sample, so a sample with just 1 bar for a genomic position is monoclonal at this position, while multiple bars would indicate a polyclonal sample.

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
# Windows actually plotted, with genomicID levels matching the plot's x order.
regions_homologousRegion_filt = regions_homologousRegion %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name) %>%
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry$p_name)))

# Add the gene-description strip below the samples (y from -10 to 0) on its
# own fill scale. new_scale_fill() is namespaced like the other calls in this
# document so it does not depend on ggnewscale being attached.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymax = 0,
      ymin = -10,
      fill = description
    ),
    data = regions_homologousRegion_filt,
    color = "black"
  ) +
  scale_fill_manual("Genes\nDescription", values = descriptionColors_homologousRegion, guide = guide_legend(nrow = 2)) +
  transparentBackground +
  theme(
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30, face = "bold"),
    legend.box = "vertical",
    legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    axis.text.x = element_text(size = 30)
  )

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot_final)
```

```{r}
# Same plot written to a PDF.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_plot.pdf", useDingbats = FALSE, width = 30, height = 25)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_withCountry_plot_final)
dev.off()
```

#### Perfect copies

```{r}
# Rainbow prep restricted to the samples above the conserved cut-off.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>%
    filter(s_Sample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_closeToPerfectCopies$s_Sample),
  minPopSize = 1
)

# Sample-by-haplotype presence/absence table used to cluster the samples.
# Only windows seen in more than 90% of the best-covered window's sample count
# are kept. The major-haplotype-only filter is currently commented out, so
# every haplotype of a sample is marked.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount > 0.9 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp[, 2:ncol(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp$s_Sample
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist)

# Re-level the samples so they follow the clustering order. popid becomes -1
# wherever maxPopid == 1.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies %>%
  mutate(s_Sample = factor(s_Sample, levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist_hclust$order])) %>%
  mutate(popid = ifelse(maxPopid == 1, -1, popid))
# Plain rainbow plot for the perfect-copy samples, window names on x.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$p_name)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$p_name),
    expand = c(0, 0)
  ) +
  scale_y_continuous(expand = c(0, 0))

# Sample metadata in the same sample order as the plot.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies = meta_preferredSample %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$s_Sample) %>%
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$s_Sample)))

# Collect every annotation color, then the fill colors the plot actually
# used, named by the sorted popid values.
allColors = c()
for (name in names(rowAnnoColors)) {
  allColors = c(allColors, rowAnnoColors[[name]])
}
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Same data with popid as a factor, for the annotated plot.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies %>%
  mutate(s_Sample = factor(s_Sample, levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_sp_dist_hclust$order])) %>%
  mutate(popid = factor(popid))

# The four leading breaks label the annotation strips drawn at negative x;
# the window positions themselves get empty labels.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = c(
      -19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25,
      seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name))
    ),
    labels = c(
      "Chr11DupHapCluster", "continent", "region", "country",
      rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name)))
      # levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name)
    ),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample)
  )

# chr11 duplicated-haplotype cluster per plotted sample, in plot order;
# clusters of size one are named "singlet".
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
  ungroup() %>%
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample) %>%
  mutate(BiologicalSample = factor(as.character(BiologicalSample), levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$s_Sample))) %>%
  mutate(Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))) %>%
  arrange(BiologicalSample)

# Named color vector for the chr11 duplicated-haplotype clusters.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry %>%
  select(Chr11DupHapClusterName, colors) %>%
  unique() %>%
  arrange(Chr11DupHapClusterName)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColorsDf$colors
names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName

# Rank names for the microhaplotype legend. sort() on the character names is
# lexicographic (1, 10, 11, 2, ...), so order them by numeric value instead;
# names that are not numeric keep an alphabetical order after the numeric ones.
closeToPerfectCopies_rankNames = names(previousColors)
closeToPerfectCopies_rankNames = closeToPerfectCopies_rankNames[
  order(suppressWarnings(as.numeric(closeToPerfectCopies_rankNames)), closeToPerfectCopies_rankNames)
]
closeToPerfectCopies_rankColors = haplotypeRankColors[closeToPerfectCopies_rankNames]

# Add the rank fill scale, then four annotation strips at negative x, each on
# its own fill scale: country (0 to -4.5), region (-5 to -9.5),
# secondaryRegion (-10 to -14.5) and chr11 cluster (-15 to -19.5).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot +
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = closeToPerfectCopies_rankColors,
    labels = names(closeToPerfectCopies_rankColors),
    breaks = names(closeToPerfectCopies_rankColors)
  ) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = 0, xmax = -4.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = country),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies
  ) +
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = -5, xmax = -9.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = region),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies
  ) +
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = -10, xmax = -14.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = secondaryRegion),
    color = "black",
    data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies
  ) +
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) +
  guides(fill = guide_legend(nrow = 3)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(xmin = -15, xmax = -19.5, ymin = as.numeric(BiologicalSample) - 0.5, ymax = as.numeric(BiologicalSample) + 0.5, fill = factor(Chr11DupHapClusterName)),
    color = "black",
    data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry
  ) +
  #               fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+
  # scale_fill_manual("HaploGroup", values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups)))) +
  #scale_fill_manual("Chr11DupHapCluster", values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors)) +
  scale_fill_manual(
    "Chr11DupHapCluster",
    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors,
    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors),
    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_Chr11DupHapClusterColors)
  ) +
  guides(fill = guide_legend(nrow = 4))
```

The y-axis is samples and the x-axis is sub-region within the chromosome, sorted by genomic position.
Haplotypes are colored by their abundance rank. Colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
# Keep only the annotated regions whose genomicID appears as a p_name in the
# plotted data, and give genomicID the same level order as p_name so that
# as.numeric(genomicID) below follows the p_name level index.
regions_homologousRegion_filt = regions_homologousRegion %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry$p_name)))

# Add one more fill scale: a strip of rectangles between y = -1 and y = 0,
# one unit wide per region, filled by the region's gene description.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot + 
  new_scale_fill() + 
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5, 
                xmax = as.numeric(genomicID) + 0.5, 
                ymax = 0, 
                ymin = -1, 
                fill = description), 
            data = regions_homologousRegion_filt, 
            color = "black") + 
  scale_fill_manual("Genes\nDescription", 
                    values = descriptionColors_homologousRegion, 
                    guide = guide_legend(nrow = 2)) + 
  transparentBackground + 
  theme(legend.text = element_text(size = 30), 
        legend.title = element_text(size = 30, face = "bold"), 
        legend.box = "vertical", 
        legend.margin = margin(), 
        legend.background = element_blank(), 
        legend.box.background = element_rect(colour = "black"), 
        axis.text.x = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot_final)
```

```{r}
# Open the PDF device for the figure; FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_plot.pdf", useDingbats = FALSE, width = 30, height = 25)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_closeToPerfectCopies_withCountry_plot_final)
dev.off()
```

#### Divergent copies

Divergent copies of the shared region

```{r}
# Prepare the rainbow-plot input for every sample that is NOT listed in the
# close-to-perfect-copies summary table.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
    filter(s_Sample %!in% popClustering_filt_hrp3_pat1_regions_homologousRegion_conservedSum_closeToPerfectCopies$s_Sample), 
  minPopSize = 1)

# select just the major haplotypes and cluster based on the sharing between
# Rows with samp_n above 90% of the maximum are kept, then the table is spread
# into one row per sample and one 0/1 column per haplotype (h_popUID).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies %>% 
  ungroup() %>% 
  filter(samp_n > 0.9*max(samp_n)) %>% 
  group_by(s_Sample, p_name) %>% 
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  ungroup() %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix of the haplotype columns (column 1 is s_Sample, which becomes
# the row names).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp$s_Sample

# Distance and hierarchical clustering of the divergent-copy samples. Within
# this chunk the result is only referenced by the commented-out `levels =`
# alternatives below; the active ordering comes from the all-sample clustering.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist)

# Sample order from the clustering of all samples (objects created outside
# this chunk), restricted to the samples present in the divergent-copy matrix.
nameOrderFromAll = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_sp_dist_hclust$order]
orderForDivergentCopy = nameOrderFromAll[nameOrderFromAll %in% rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat)]

#rename the levels so they are in the order of the clustering
# popid is set to -1 wherever maxPopid == 1; "-1" is mapped to "grey0" below.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies %>% 
  mutate(s_Sample = factor(s_Sample, 
                           # levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist_hclust$order]))%>% 
                           levels = orderForDivergentCopy)) %>% 
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# First pass of the plot (numeric popid); it is built so the fill colors
# assigned by ggplot can be read back with ggplot_build() below.
# seq_along() is used for the breaks so that zero levels give zero breaks
# (1:length(x) would give c(1, 0) and no longer match the labels).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies, colorCol = popid) + 
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$p_name)), 
                     labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$p_name), 
                     expand = c(0,0)) + 
  scale_y_continuous(expand = c(0,0))

# Sample metadata for the plotted samples, with BiologicalSample sharing the
# s_Sample level order so as.numeric(BiologicalSample) follows the same index.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$s_Sample) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$s_Sample)))

# Concatenate every annotation color vector in one call instead of growing the
# vector inside a loop; unname() keeps the list names from being prefixed onto
# the element names.
allColors = do.call(c, unname(rowAnnoColors))

# Fill colors used by the first layer of the plot, named by the sorted popid
# values. NOTE(review): assumes the order of the unique fills matches
# sort(unique(popid)) - confirm.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)

# Same table with popid as a factor, used for the annotated plot.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies %>% 
  mutate(s_Sample = factor(s_Sample, 
                           # levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_sp_dist_hclust$order])) %>% 
                           levels = orderForDivergentCopy)) %>% 
  mutate(popid= factor(popid))

# Annotated plot. The four extra x breaks are the centers of the annotation
# strips drawn further down (each strip is 4.5 wide, so center = edge + 2.25);
# the per-region tick labels are blanked.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry, colorCol = popid) + 
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = c(-19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, 
                                seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name))), 
                     labels = c("Chr11DupHapCluster", "continent", "region", "country", 
                                rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name))) 
                                # levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name) 
                                ), 
                     expand = c(0,0)) + 
  scale_y_continuous(
    expand = c(0, 0), 
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample)
  )

# Chr11 duplication cluster assignment for the plotted samples. Clusters of
# size 1 are labelled "singlet"; other cluster names are left-padded with "0"
# to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample) %>% 
  mutate(BiologicalSample = factor(as.character(BiologicalSample), levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$s_Sample))) %>% 
  mutate(Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))) %>% 
  arrange(BiologicalSample)

# Named color vector (cluster name -> color) for the cluster annotation strip.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry %>% 
  select(Chr11DupHapClusterName, colors) %>% 
  unique() %>% 
  arrange(Chr11DupHapClusterName)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColorsDf$colors
names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName

# Attach the haplotype-rank fill scale, then four annotation strips to the left
# of x = 0 (country, region, secondaryRegion, Chr11 cluster). Each strip gets
# its own fill scale via ggnewscale::new_scale_fill().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot + 
  scale_fill_manual("Microhaplotype\nRank", 
                    values = haplotypeRankColors[sort(names(previousColors))], 
                    labels = names(haplotypeRankColors[sort(names(previousColors))]), 
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= 0, xmax = -4.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = country), 
            color = "black", 
            data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies) + 
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= -5, xmax = -9.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = region), 
            color = "black", 
            data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies) + 
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= -10, xmax = -14.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = secondaryRegion), 
            color = "black", 
            data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies) + 
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= -15, xmax = -19.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = factor(Chr11DupHapClusterName)), 
            color = "black", 
            data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry) + 
  # fill = factor(hcclust)), color = "black", data = jacardDist_gat_filt_sp_mat_pat1_hc_groups_df)+ 
  # scale_fill_manual("HaploGroup", values = scheme$hex(length(unique(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups)))) + 
  #scale_fill_manual("Chr11DupHapCluster", values = haploGroupColors, labels = names(haploGroupColors), breaks = names(haploGroupColors)) + 
  scale_fill_manual("Chr11DupHapCluster", 
                    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors, 
                    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors), 
                    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_Chr11DupHapClusterColors)) + 
  guides(fill = guide_legend(nrow = 4))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank. Colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
# Keep only the annotated regions whose genomicID appears as a p_name in the
# plotted data, with genomicID sharing the p_name level order.
regions_homologousRegion_filt = regions_homologousRegion %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry$p_name)))

# Add a strip of gene-description rectangles between y = -10 and y = 0.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot + 
  new_scale_fill() + 
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5, 
                xmax = as.numeric(genomicID) + 0.5, 
                ymax = 0, 
                ymin = -10, 
                fill = description), 
            data = regions_homologousRegion_filt, 
            color = "black") + 
  scale_fill_manual(values = descriptionColors_homologousRegion, guide = guide_legend(nrow = 2)) + 
  labs(fill = "Genes\nDescription") + 
  transparentBackground + 
  theme(legend.text = element_text(size = 30), 
        legend.title = element_text(size = 30, face = "bold"), 
        legend.box = "vertical", 
        legend.margin = margin(), 
        legend.background = element_blank(), 
        legend.box.background = element_rect(colour = "black"), 
        axis.text.x = element_text(size = 30))

print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot_final)
```

```{r}
# Write the figure to PDF; FALSE is spelled out because `F` can be reassigned.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_plot.pdf", useDingbats = FALSE, width = 30, height = 30)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_divergentCopies_withCountry_plot_final)
dev.off()
```

#### Perfect chr11 copies

The shared region of the strains with perfect chr11 copies.

```{r}
# Prepare the rainbow-plot input for the samples listed in the chr11
# close-to-perfect-copies summary table.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
    filter(
      s_Sample %in% popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_conservedSum_closeToPerfectCopies$s_Sample
    ), 
  minPopSize = 1
)

# select just the major haplotypes and cluster based on the sharing between
# One row per sample, one 0/1 column per haplotype (h_popUID); the samp_n
# filter is disabled here.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies %>% 
  ungroup() %>% 
  # filter(samp_n > 0.9*max(samp_n)) %>% 
  group_by(s_Sample, p_name) %>% 
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  ungroup() %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix of the haplotype columns, with samples as row names.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp$s_Sample

popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist)

# Position of each sample in the dendrogram order, kept as a sort key.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust_orderDf = tibble(
  BiologicalSample = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust$order]
) %>% 
  mutate(byGenomicRegionHclustOrder = row_number())

# Note: this re-sorts the shared cluster table in place.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  arrange(newClusterName)

# Cluster table for the plotted samples, joined to the dendrogram order and to
# sample geography; the resulting row order defines the sample order on the
# plot's y axis. Both joins rely on dplyr's natural join (all shared columns).
# NOTE(review): sorts by `Chr11DupHapCluster`, while other code in this section
# uses `newClusterName` / `Chr11DupHapClusterName` - confirm the column exists.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$s_Sample) %>% 
  left_join(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust_orderDf) %>% 
  left_join(meta_preferredSample %>% 
              select(BiologicalSample, country, subRegion, region, secondaryRegion)) %>% 
  arrange(Chr11DupHapCluster, subRegion, country, byGenomicRegionHclustOrder)

#rename the levels so they are in the order of the clustering
# popid is set to -1 wherever maxPopid == 1; "-1" is mapped to "grey0" below.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies %>% 
  mutate(s_Sample = factor(s_Sample, 
                           #levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust$order]))%>% 
                           levels = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select$BiologicalSample)
         ) %>% 
  mutate(popid = ifelse(maxPopid == 1, -1, popid))

# First pass of the plot (numeric popid), built so its fill colors can be read
# back below. seq_along() keeps breaks and labels the same length even when
# there are no levels.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies, colorCol = popid) + 
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$p_name)), 
                     labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$p_name), 
                     expand = c(0,0)) + 
  scale_y_continuous(expand = c(0,0))

# Sample metadata (plus cluster name) for the plotted samples, with
# BiologicalSample sharing the s_Sample level order.
meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies = meta_preferredSample %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$s_Sample) %>% 
  left_join(popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select %>% 
              select(BiologicalSample, newClusterName)) %>% 
  mutate(BiologicalSample = factor(BiologicalSample, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$s_Sample)))

# Concatenate every annotation color vector in one call instead of growing the
# vector inside a loop; unname() keeps the list names from being prefixed onto
# the element names.
allColors = do.call(c, unname(rowAnnoColors))

# Fill colors used by the first layer of the plot, named by the sorted popid
# values. NOTE(review): assumes the order of the unique fills matches
# sort(unique(popid)) - confirm.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies$popid))
previousColors["-1"] = "grey0"
allColors = c(allColors, previousColors)
# Same table with popid as a factor, used for the annotated plot. Sample order
# follows the row order of the selected cluster table.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies %>% 
  # mutate(s_Sample = factor(s_Sample, 
  #                          levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_sp_dist_hclust$order])) %>% 
  mutate(s_Sample = factor(s_Sample, levels = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_select$BiologicalSample)) %>% 
  mutate(popid= factor(popid))

# Annotated plot. The four extra x breaks are the centers of the annotation
# strips drawn further down (each strip is 4.5 wide, so center = edge + 2.25);
# the per-region tick labels are blanked. seq_along() is used so that zero
# levels give zero breaks (1:length(x) would give c(1, 0) and no longer match
# the labels).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry, colorCol = popid) + 
  theme(axis.text.x = element_text(size=12, angle = -90, vjust = 0.5, hjust = 0)) + 
  scale_x_continuous(breaks = c(-19.5 + 2.25, -14.5 + 2.25, -9.5 + 2.25, -4.5 + 2.25, 
                                seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name))), 
                     labels = c("Chr11DupHapCluster", "continent", "region", "country", 
                                rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name))) 
                                # levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name) 
                                ), 
                     expand = c(0,0)) + 
  scale_y_continuous(
    expand = c(0, 0), 
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample)), 
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample)
  )

# Chr11 duplication cluster assignment for the plotted samples. Clusters of
# size 1 are labelled "singlet"; other cluster names are left-padded with "0"
# to width 2.
popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>% 
  ungroup() %>% 
  filter(BiologicalSample %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample) %>% 
  mutate(BiologicalSample = factor(as.character(BiologicalSample), levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$s_Sample))) %>% 
  mutate(Chr11DupHapClusterName = ifelse(hcclustSize == 1, "singlet", stringr::str_pad(newClusterName, width = 2, pad = "0"))) %>% 
  arrange(BiologicalSample)

# Named color vector (cluster name -> color) for the cluster annotation strip.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColorsDf = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry %>% 
  select(Chr11DupHapClusterName, colors) %>% 
  unique() %>% 
  arrange(Chr11DupHapClusterName)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColorsDf$colors
names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColorsDf$Chr11DupHapClusterName

# Attach the haplotype-rank fill scale, then four annotation strips to the left
# of x = 0 (country, region, secondaryRegion, Chr11 cluster). Each strip gets
# its own fill scale via ggnewscale::new_scale_fill().
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot + 
  scale_fill_manual("Microhaplotype\nRank", 
                    values = haplotypeRankColors[sort(names(previousColors))], 
                    labels = names(haplotypeRankColors[sort(names(previousColors))]), 
                    breaks = names(haplotypeRankColors[sort(names(previousColors))])) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= 0, xmax = -4.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = country), 
            color = "black", 
            data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies) + 
  scale_fill_manual("country", values = rowAnnoColors[["country"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= -5, xmax = -9.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = region), 
            color = "black", 
            data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies) + 
  scale_fill_manual("region", values = rowAnnoColors[["region"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= -10, xmax = -14.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = secondaryRegion), 
            color = "black", 
            data = meta_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies) + 
  scale_fill_manual("Continent", values = rowAnnoColors[["continent"]]) + 
  guides(fill = guide_legend(nrow = 3)) + 
  ggnewscale::new_scale_fill() + 
  geom_rect(aes(xmin= -15, xmax = -19.5, 
                ymin = as.numeric(BiologicalSample) - 0.5, 
                ymax = as.numeric(BiologicalSample) + 0.5, 
                fill = factor(Chr11DupHapClusterName)), 
            color = "black", 
            data = popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df_in_popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry) + 
  scale_fill_manual("Chr11DupHapCluster", 
                    values = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors, 
                    labels = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors), 
                    breaks = names(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_Chr11DupHapClusterColors)) + 
  guides(fill = guide_legend(nrow = 4))
```

The y axis is samples and the x-axis is sub region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank. Colors within a vertical column denote the same haplotype, but the same color in different columns does not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within the sample, so a sample with just 1 bar at a genomic position is monoclonal at that position, while multiple bars would indicate a polyclonal sample.

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 20
# Keep only the annotated regions whose genomicID appears as a p_name in the
# plotted data, with genomicID sharing the p_name level order.
regions_homologousRegion_filt = regions_homologousRegion %>% 
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name) %>% 
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry$p_name)))

# Add a strip of gene-description rectangles between y = -10 and y = 0.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot + 
  new_scale_fill() + 
  geom_rect(aes(xmin = as.numeric(genomicID) - 0.5, 
                xmax = as.numeric(genomicID) + 0.5, 
                ymax = 0, 
                ymin = -10, 
                fill = description), 
            data = regions_homologousRegion_filt, 
            color = "black") + 
  scale_fill_manual(values = descriptionColors_homologousRegion, guide = guide_legend(nrow = 2)) + 
  labs(fill = "Genes\nDescription") + 
  transparentBackground + 
  theme(legend.text = element_text(size = 30), 
        legend.title = element_text(size = 30, face = "bold"), 
        legend.box = "vertical", 
        legend.margin = margin(), 
        legend.background = element_blank(), 
        legend.box.background = element_rect(colour = "black"), 
        axis.text.x = element_text(size = 30))
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot_final)
```

```{r}
# Write the figure to PDF; FALSE is spelled out because `F` can be reassigned.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_plot.pdf", useDingbats = FALSE, width = 30, height = 30)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_perfectChr11Copies_withCountry_plot_final)
dev.off()
```

### Sub set

### SD01, HB3, Santa-Lucia-Salvador-I

```{r}
# Prepare the rainbow-plot input for the three named isolates only.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates = HaplotypeRainbows::prepForRainbow(
  popClustering_filt_hrp3_pat1_regions_homologousRegion %>% 
    filter(s_Sample %in% c("HB3", "SD01", "Santa-Lucia-Salvador-I")), 
  minPopSize = 1)

# select just the major haplotypes and cluster based on the sharing between
# Regions (p_name) are kept when the number of distinct samples covering them
# is above 90% of the maximum; the table is then spread into one row per
# sample and one 0/1 column per haplotype (h_popUID).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>% 
  group_by(p_name) %>% 
  mutate(sampleCount = length(unique(s_Sample))) %>% 
  ungroup() %>% 
  filter(sampleCount > 0.9*max(sampleCount)) %>% 
  group_by(s_Sample, p_name) %>% 
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>% 
  mutate(marker = 1) %>% 
  ungroup() %>% 
  select(h_popUID, marker, s_Sample) %>% 
  spread(h_popUID, marker, fill = 0)

# Numeric matrix of the haplotype columns, with samples as row names.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat = as.matrix(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp[,2:ncol(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp)])
rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat) = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp$s_Sample

# Distance and hierarchical clustering of the isolates; the dendrogram order
# is used for the sample order of the plot.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist = dist(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat)
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist_hclust = hclust(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist)
#rename the levels so they are in the order of the clustering
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
  mutate(s_Sample = factor(s_Sample, levels = rownames(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_mat)[popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sp_dist_hclust$order]))

# First pass: build the plot only to read back the fill colors ggplot assigned.
# seq_along() is used for the breaks so zero levels gives no breaks, not c(1, 0).
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot = genRainbowHapPlotObj(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name)),
    #labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name),
    labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name))),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample),
    expand = c(0, 0)
  )

# Name the extracted colors by the sorted popid values; "-1" is forced to black.
previousColors = unique(ggplot_build(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot)$data[[1]][["fill"]])
names(previousColors) = sort(unique(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$popid))
previousColors["-1"] = "grey0"

# Second pass: same plot with popid as a factor so a manual discrete fill scale can be applied.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot = genRainbowHapPlotObj(
  popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
    mutate(popid = factor(popid)),
  colorCol = popid
) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name)),
    #labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name),
    labels = rep("", length(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name))),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    breaks = seq_along(levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample)),
    labels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$s_Sample),
    expand = c(0, 0)
  )
```

The y-axis is samples and the x-axis is the sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank; while colors in a vertical column are the same haplotype, the same colors between columns do not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position is monoclonal at this position, while multiple bars would indicate a polyclonal sample.

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 5
# Regions present in the lab-isolate data, ordered like the plot's x-axis levels.
regions_homologousRegion_filt = regions_homologousRegion %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name) %>%
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates$p_name)))

# Rank colors for the popids actually present (looked up by sorted name).
haplotypeRankColors_sel = haplotypeRankColors[sort(names(previousColors))]

# Haplotype-rank fill scale, then a second fill scale for the gene-description
# strip drawn from y = -1 to 0.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot_final = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot +
  scale_fill_manual(
    "Microhaplotype\nRank",
    values = haplotypeRankColors_sel,
    labels = names(haplotypeRankColors_sel),
    breaks = names(haplotypeRankColors_sel)
  ) +
  guides(fill = guide_legend(nrow = 1)) +
  ggnewscale::new_scale_fill() +
  geom_rect(
    aes(
      xmin = as.numeric(genomicID) - 0.5,
      xmax = as.numeric(genomicID) + 0.5,
      ymax = 0,
      ymin = -1,
      fill = description
    ),
    data = regions_homologousRegion_filt,
    color = "black"
  ) +
  scale_fill_manual(values = descriptionColors_homologousRegion, guide = guide_legend(nrow = 2)) +
  labs(fill = "Genes\nDescription") +
  transparentBackground +
  theme(
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30, face = "bold"),
    legend.box = "vertical",
    legend.margin = margin(),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    axis.text.x = element_text(size = 30),
    axis.text.y = element_text(size = 30)
  )
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot_final)
```

```{r}
# Write the same figure to PDF.
pdf("popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot.pdf", useDingbats = FALSE, width = 30, height = 10)
print(popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_plot_final)
dev.off()
```

### Getting regions with SD01 Multi In Shared

Outputting the regions within the shared region where SD01 has multiple variants

```{r}
# Count distinct haplotypes per region for SD01; s_COI > 1 marks regions with more than one.
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01 = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates %>%
  filter(s_Sample == "SD01") %>%
  group_by(s_Sample, p_name) %>%
  mutate(s_COI = length(unique(h_popUID)))
popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01_multi = popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01 %>%
  filter(s_COI > 1)

regions_withSD01MultiInShared = regions %>%
  filter(genomicID %in% popClustering_filt_hrp3_pat1_regions_homologousRegion_prep_LabIsolates_sd01_multi$p_name)
write_tsv(regions_withSD01MultiInShared, "regions_withSD01MultiInShared.bed")

# Matching sub-windows (X4 holds the region name); written without a header row.
finalHrpSubwindows_regions_withSD01MultiInShared = finalHrpSubwindows %>%
  filter(X4 %in% regions_withSD01MultiInShared$name)
write_tsv(finalHrpSubwindows_regions_withSD01MultiInShared, "finalHrpSubwindows_regions_withSD01MultiInShared.bed", col_names = FALSE)
```

### Plotting Shared Region for Pacbio Genomes

```{r}
# Samples whose `sample` value starts with "Pf", restricted to the homologous-region targets.
popClustering_labIso = popClustering %>%
  left_join(meta %>% rename(s_Sample = BiologicalSample)) %>%
  filter(grepl("^Pf", sample))
popClustering_labIso_homologousRegion = popClustering_labIso %>%
  filter(p_name %in% regions_homologousRegion$genomicID)
popClustering_labIso_homologousRegion_prep = HaplotypeRainbows::prepForRainbow(popClustering_labIso_homologousRegion, minPopSize = 1)
```

```{r}
# select just the major haplotypes and cluster based on the sharing between
popClustering_labIso_homologousRegion_prep_sp = popClustering_labIso_homologousRegion_prep %>%
  group_by(p_name) %>%
  mutate(sampleCount = length(unique(s_Sample))) %>%
  ungroup() %>%
  filter(sampleCount > 0.9 * max(sampleCount)) %>%
  group_by(s_Sample, p_name) %>%
  # filter(c_AveragedFrac == max(c_AveragedFrac)) %>%
  mutate(marker = 1) %>%
  ungroup() %>%
  select(h_popUID, marker, s_Sample) %>%
  spread(h_popUID, marker, fill = 0)

popClustering_labIso_homologousRegion_prep_sp_mat = as.matrix(popClustering_labIso_homologousRegion_prep_sp[, 2:ncol(popClustering_labIso_homologousRegion_prep_sp)])
rownames(popClustering_labIso_homologousRegion_prep_sp_mat) = popClustering_labIso_homologousRegion_prep_sp$s_Sample
popClustering_labIso_homologousRegion_prep_sp_dist = dist(popClustering_labIso_homologousRegion_prep_sp_mat)
popClustering_labIso_homologousRegion_prep_sp_dist_hclust = hclust(popClustering_labIso_homologousRegion_prep_sp_dist)

#rename the levels so they are in the order of the clustering
popClustering_labIso_homologousRegion_prep = popClustering_labIso_homologousRegion_prep %>%
  mutate(s_Sample = factor(s_Sample, levels = rownames(popClustering_labIso_homologousRegion_prep_sp_mat)[popClustering_labIso_homologousRegion_prep_sp_dist_hclust$order]))

popClustering_labIso_homologousRegion_prep_plot = genRainbowHapPlotObj(popClustering_labIso_homologousRegion_prep, colorCol = popid) +
  theme(axis.text.x = element_text(size = 12, angle = -90, vjust = 0.5, hjust = 0)) +
  scale_x_continuous(
    breaks = seq_along(levels(popClustering_labIso_homologousRegion_prep$p_name)),
    labels = levels(popClustering_labIso_homologousRegion_prep$p_name),
    expand = c(0, 0)
  ) +
  scale_y_continuous(expand = c(0, 0))
```

The y-axis is samples and the x-axis is the sub-region within the chromosome, sorted by genomic position. Haplotypes are colored by their abundance rank; while colors in a vertical column are the same haplotype, the same colors between columns do not mean the same haplotype. Rank is ordered by frequency within the total population. Columns with black bars are columns where there is no haplotype variation. Bar heights are relative to abundance within sample, so a sample with just 1 bar for a genomic position is monoclonal at this position, while multiple bars would indicate a polyclonal sample.

```{r}
#| fig-column: screen
#| fig-width: 30
#| fig-height: 5
regions_homologousRegion_filt = regions_homologousRegion %>%
  filter(genomicID %in% popClustering_labIso_homologousRegion_prep$p_name) %>%
  mutate(genomicID = factor(genomicID, levels = levels(popClustering_labIso_homologousRegion_prep$p_name)))

print(
  popClustering_labIso_homologousRegion_prep_plot +
    new_scale_fill() +
    geom_rect(
      aes(
        xmin = as.numeric(genomicID) - 0.5,
        xmax = as.numeric(genomicID) + 0.5,
        ymax = 0,
        ymin = -1,
        fill = description
      ),
      data = regions_homologousRegion_filt,
      color = "black"
    ) +
    scale_fill_manual(values = descriptionColors_homologousRegion, guide = guide_legend(nrow = 2))
)
```

# Plotting whole genome inter-relatedness between strains with deletions

```{r, echo=TRUE, eval=TRUE}
# Subset the MIPSIBC cluster table to the samples that have deletion calls.
allSel = readr::read_tsv("/Users/nick/Dropbox (Personal)/ownCloud/documents/plasmodium/falciparum/pfepipanels/Pf_Epi_Panels/data/MIPSIBC/data/pf/reports/slim_allSelectedClustersInfo.tab.txt.gz")
allSel_filt = allSel %>%
  filter(s_Sample %in% previousDeletionCalls$BiologicalSample)
write_tsv(allSel_filt, "MIPSIBC_previousDeletionCalls_samples.tsv")
```

```{r, echo=TRUE, eval=TRUE}
# Same subset for the heome1 cluster table (allSel / allSel_filt are overwritten).
allSel = readr::read_tsv("/Users/nick/Dropbox (Personal)/ownCloud/documents/plasmodium/falciparum/pfepipanels/Pf_Epi_Panels/data/heome1/data/pf/reports/slim_allSelectedClustersInfo.tab.txt.gz")
allSel_filt = allSel %>%
  filter(s_Sample %in% previousDeletionCalls$BiologicalSample)
write_tsv(allSel_filt, "heome1_previousDeletionCalls_samples.tsv")
```

```{bash, echo=T, eval=F}
elucidator doPairwiseComparisonOnHapsSharing --tableFnp heome1_previousDeletionCalls_samples.tsv --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 12 --dout heome1_previousDeletionCalls_samples_pairwiseComps --verbose --overWriteDir --metaFnp /tank/data/plasmodium/falciparum/pfdata/metadata/metaByBiosample.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion --writeOutDistMatrices
elucidator doPairwiseComparisonOnHapsSharing --tableFnp MIPSIBC_previousDeletionCalls_samples.tsv --sampleCol s_Sample --targetNameCol p_name --popIDCol h_popUID --relAbundCol c_AveragedFrac --numThreads 12 --dout MIPSIBC_previousDeletionCalls_samples_pairwiseComps --verbose --overWriteDir --metaFnp /tank/data/plasmodium/falciparum/pfdata/metadata/metaByBiosample.tab.txt --metaFieldsToCalcPopDiffs country,region,secondaryRegion --writeOutDistMatrices
```

## Heome1

```{r, echo=TRUE, eval=TRUE}
# Read the sample names and the header-less pairwise matrix written by the command above.
sample_metadata_withAllDeletionCalls = readr::read_tsv("sample_metadata_withAllDeletionCalls.tsv")
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_samples = readr::read_tsv("heome1_previousDeletionCalls_samples_pairwiseComps/sampleNames.tab.txt", col_names = FALSE)
heome1_previousDeletionCalls_samples_jacardByHapsTarShared = readr::read_tsv("heome1_previousDeletionCalls_samples_pairwiseComps/jacardByHapsTarShared.tab.txt.gz", col_names = FALSE)
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat = as.matrix(heome1_previousDeletionCalls_samples_jacardByHapsTarShared)
# Label both dimensions with the sample names, then keep/order by metaSelected.
colnames(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat) = heome1_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1
rownames(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat) = heome1_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat = heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat[metaSelected$BiologicalSample, metaSelected$BiologicalSample]

# Blue-white-red ramp from the observed minimum to 1, white at the midpoint.
col_fun_min = min(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat)
col_fun = colorRamp2(
  c(col_fun_min, col_fun_min + (1 - col_fun_min) / 2, 1),
  c("#2166ac", "white", "#b2182b")
)

# Per-sample annotation rows in matrix column order. ungroup() matches the
# sibling MIPSIBC/hmmIBD sections, so select() cannot carry grouping columns
# into the join.
previousDeletionCalls_sel = previousDeletionCalls[match(colnames(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat), previousDeletionCalls$BiologicalSample), ] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

rowAnnoDf = previousDeletionCalls_sel[, c("country", "region", "secondaryRegion", "newClusterName", "Pattern")] %>%
  rename(continent = secondaryRegion, Chr11DupHapCluster = newClusterName) %>%
  as.data.frame()

rowAnnoColors = createColorListFromDf(rowAnnoDf)
# NOTE(review): presumably this .Rdata restores a saved `rowAnnoColors`, replacing
# the list computed on the previous line - confirm what the file contains.
load("rowAnnoColors.Rdata")
rowAnnoColors_mod = rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] = rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize = 25
annotationTitleTextSize = 20

# Same annotation on top (legend hidden) and on the left (legend shown).
topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# `title` is a legend parameter, not a graphical parameter, so it is passed in
# heatmap_legend_param rather than inside gpar().
heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm = Heatmap(
  heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

```{r}
#| fig-column: screen
#| fig-width: 25
#| fig-height: 15
draw(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
pdf("heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm.pdf", width = 25, height = 30, useDingbats = FALSE)
draw(heome1_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

## MIPSIBC

```{r, echo=TRUE, eval=TRUE}
sample_metadata_withAllDeletionCalls = readr::read_tsv("sample_metadata_withAllDeletionCalls.tsv")
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_samples = readr::read_tsv("MIPSIBC_previousDeletionCalls_samples_pairwiseComps/sampleNames.tab.txt", col_names = FALSE)
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared = readr::read_tsv("MIPSIBC_previousDeletionCalls_samples_pairwiseComps/jacardByHapsTarShared.tab.txt.gz", col_names = FALSE)
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat = as.matrix(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared)

# Label both dimensions with the sample names, then keep/order by metaSelected.
colnames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat) = MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1
rownames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat) = MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_samples$X1
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat = MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat[metaSelected$BiologicalSample, metaSelected$BiologicalSample]

# Blue-white-red ramp from the observed minimum to 1, white at the midpoint.
col_fun_min = min(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat)
col_fun = colorRamp2(
  c(col_fun_min, col_fun_min + (1 - col_fun_min) / 2, 1),
  c("#2166ac", "white", "#b2182b")
)

# Per-sample annotation rows in matrix column order.
previousDeletionCalls_sel = previousDeletionCalls[match(colnames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat), previousDeletionCalls$BiologicalSample), ] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

rowAnnoDf = previousDeletionCalls_sel[, c("country", "region", "secondaryRegion", "newClusterName", "Pattern")] %>%
  rename(continent = secondaryRegion, Chr11DupHapCluster = newClusterName) %>%
  as.data.frame()

rowAnnoColors = createColorListFromDf(rowAnnoDf)
# NOTE(review): presumably this .Rdata restores a saved `rowAnnoColors`, replacing
# the list computed on the previous line - confirm what the file contains.
load("rowAnnoColors.Rdata")
rowAnnoColors_mod = rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] = rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize = 25
annotationTitleTextSize = 20

topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames so no row/column labels are drawn.
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs = MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat
rownames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs) = NULL
colnames(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs) = NULL

# `title` is a legend parameter, so it is passed in heatmap_legend_param, not in gpar().
MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm = Heatmap(
  MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_nolabs,
  col = col_fun,
  name = "JacardIndex",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "JacardIndex",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

```{r}
#| fig-column: screen
#| fig-width: 25
#| fig-height: 15
draw(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
pdf("MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm.pdf", width = 25, height = 30, useDingbats = FALSE)
draw(MIPSIBC_previousDeletionCalls_samples_jacardByHapsTarShared_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

## hmmIBD

```{r}
# One row per (sample, SRA run): SRARuns is a comma-separated list.
sample_metadata_withAllDeletionCalls_sel = sample_metadata_withAllDeletionCalls %>%
  select(sample, SRARuns) %>%
  filter(sample %in% previousDeletionCalls$BiologicalSample) %>%
  mutate(SRARuns = strsplit(SRARuns, split = ",")) %>%
  unnest(SRARuns)

hmm_fract = readr::read_tsv("filtered_gatk_calls_database_hrpsallMetaDeletionCalls.hmm_fract.txt")

# Stack each pair with its mirror image (sample1/sample2 swapped) so both
# orientations of every comparison are present.
hmm_fract_combined = bind_rows(
  hmm_fract %>%
    arrange(sample1, sample2) %>%
    select(sample1, sample2, fract_sites_IBD),
  hmm_fract %>%
    arrange(sample1, sample2) %>%
    select(sample1, sample2, fract_sites_IBD) %>%
    rename(temp1 = sample2, temp2 = sample1) %>%
    rename(sample1 = temp1, sample2 = temp2)
)

# Map run IDs to BiologicalSample; runs with no metadata match keep their own
# name, and "fcr3" is normalised to "FCR3".
hmm_fract_combined_samples = tibble(sample1 = unique(hmm_fract_combined$sample1)) %>%
  left_join(
    sample_metadata_withAllDeletionCalls_sel %>%
      rename(BiologicalSample = sample, sample1 = SRARuns),
    by = "sample1"
  ) %>%
  mutate(BiologicalSample = ifelse(is.na(BiologicalSample), sample1, BiologicalSample)) %>%
  mutate(BiologicalSample = ifelse(BiologicalSample == "fcr3", "FCR3", BiologicalSample))

perSampleVarCounts = readr::read_tsv("filtered_hrpsallMetaDeletionCalls_variants_perSampleCounts.tsv") %>%
  rename(sample1 = `[3]sample`, nMissing = `[14]nMissing`)

# Per BiologicalSample keep the run with the smallest nMissing (rank 1 after
# sorting); ungroup so downstream verbs do not inherit the grouping.
hmm_fract_combined_samples_filt = hmm_fract_combined_samples %>%
  left_join(perSampleVarCounts %>% select(sample1, nMissing), by = "sample1") %>%
  group_by(BiologicalSample) %>%
  arrange(BiologicalSample, nMissing) %>%
  mutate(rank = row_number(), totalSamples = n()) %>%
  filter(rank == 1) %>%
  ungroup()

# Keep pairs where both runs were selected and attach their BiologicalSample names.
hmm_fract_combined_filt = hmm_fract_combined %>%
  filter(
    sample1 %in% hmm_fract_combined_samples_filt$sample1,
    sample2 %in% hmm_fract_combined_samples_filt$sample1
  ) %>%
  left_join(
    hmm_fract_combined_samples_filt %>%
      rename(BiologicalSample1 = BiologicalSample) %>%
      select(sample1, BiologicalSample1),
    by = "sample1"
  ) %>%
  left_join(
    hmm_fract_combined_samples_filt %>%
      rename(sample2 = sample1, BiologicalSample2 = BiologicalSample) %>%
      select(sample2, BiologicalSample2),
    by = "sample2"
  )

# Wide sample x sample table; missing cells (e.g. self comparisons) are filled with 1.
hmm_fract_sp = hmm_fract_combined_filt %>%
  select(BiologicalSample1, BiologicalSample2, fract_sites_IBD) %>%
  spread(BiologicalSample2, fract_sites_IBD, fill = 1)
hmm_fract_sp_mat = as.matrix(hmm_fract_sp[, 2:ncol(hmm_fract_sp)])
rownames(hmm_fract_sp_mat) = hmm_fract_sp$BiologicalSample1
```

```{r}
library(circlize)
hmm_fract_sp_mat = hmm_fract_sp_mat[metaSelected$BiologicalSample, metaSelected$BiologicalSample]
# col_fun = colorRamp2(c(min(hmm_fract_sp_mat), min(hmm_fract_sp_mat) + (1-min(hmm_fract_sp_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
col_fun = colorRamp2(c(0, 0.5, 1), c("#2166ac", "white", "#b2182b"))

# Per-sample annotation rows in matrix column order; Pattern is filled in for
# samples without one that are flagged possiblyHRP2Deleted, and set for FCR3.
previousDeletionCalls_sel = previousDeletionCalls[match(colnames(hmm_fract_sp_mat), previousDeletionCalls$BiologicalSample), ] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  ) %>%
  mutate(Pattern = ifelse(is.na(Pattern) & possiblyHRP2Deleted, "8-TARE1", Pattern)) %>%
  mutate(Pattern = ifelse("FCR3" == BiologicalSample, "13++11-", Pattern))

rowAnnoDf = previousDeletionCalls_sel[, c("country", "region", "secondaryRegion", "newClusterName", "Pattern", "hrpCall")] %>%
  rename(continent = secondaryRegion, Chr11DupHapCluster = newClusterName) %>%
  as.data.frame()

rowAnnoColors = createColorListFromDf(rowAnnoDf)
# NOTE(review): presumably this .Rdata restores a saved `rowAnnoColors`, replacing
# the list computed on the previous line - confirm what the file contains.
load("rowAnnoColors.Rdata")
rowAnnoColors_mod = rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] = rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize = 25
annotationTitleTextSize = 20

topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames so no row/column labels are drawn.
hmm_fract_sp_mat_nolabs = hmm_fract_sp_mat
rownames(hmm_fract_sp_mat_nolabs) = NULL
colnames(hmm_fract_sp_mat_nolabs) = NULL

# `title` is a legend parameter, so it is passed in heatmap_legend_param, not in gpar().
hmm_fract_sp_mat_hm = Heatmap(
  hmm_fract_sp_mat_nolabs,
  col = col_fun,
  name = "fracIBDSites",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "fracIBDSites",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

```{r}
#| fig-column: screen
#| fig-width: 27
#| fig-height: 30
draw(hmm_fract_sp_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
pdf("hmmIBD_fract_sp_mat_hm.pdf", width = 25, height = 30, useDingbats = FALSE)
draw(hmm_fract_sp_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

### hmmIBD for just 13-11++
parasites

```{r}
# IBD matrix restricted to pairs where both samples are in metaSelected_hrp3_pat1.
# spread() is kept so rows and columns come out in the same sorted order, which
# the shared row/column annotation below relies on - TODO confirm.
hmm_fract_sp_pat1 = hmm_fract_combined_filt %>%
  filter(
    BiologicalSample1 %in% metaSelected_hrp3_pat1$BiologicalSample,
    BiologicalSample2 %in% metaSelected_hrp3_pat1$BiologicalSample
  ) %>%
  select(BiologicalSample1, BiologicalSample2, fract_sites_IBD) %>%
  spread(BiologicalSample2, fract_sites_IBD, fill = 1)
hmm_fract_sp_pat1_mat = as.matrix(hmm_fract_sp_pat1[, 2:ncol(hmm_fract_sp_pat1)])
rownames(hmm_fract_sp_pat1_mat) = hmm_fract_sp_pat1$BiologicalSample1
```

```{r}
library(circlize)
# col_fun = colorRamp2(c(min(hmm_fract_sp_pat1_mat), min(hmm_fract_sp_pat1_mat) + (1-min(hmm_fract_sp_pat1_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
col_fun = colorRamp2(c(0, 0.5, 1), c("#2166ac", "white", "#b2182b"))

# Per-sample annotation rows in matrix column order.
previousDeletionCalls_sel = previousDeletionCalls[match(colnames(hmm_fract_sp_pat1_mat), previousDeletionCalls$BiologicalSample), ] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

rowAnnoDf = previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion",
  "newClusterName",
  # "Pattern",
  "hrpCall"
)] %>%
  rename(continent = secondaryRegion, Chr11DupHapCluster = newClusterName) %>%
  as.data.frame()

rowAnnoColors = createColorListFromDf(rowAnnoDf)
# NOTE(review): presumably this .Rdata restores a saved `rowAnnoColors`, replacing
# the list computed on the previous line - confirm what the file contains.
load("rowAnnoColors.Rdata")
rowAnnoColors_mod = rowAnnoColors
rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] = rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize = 25
annotationTitleTextSize = 20

topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames so no row/column labels are drawn.
hmm_fract_sp_pat1_mat_nolabs = hmm_fract_sp_pat1_mat
rownames(hmm_fract_sp_pat1_mat_nolabs) = NULL
colnames(hmm_fract_sp_pat1_mat_nolabs) = NULL

# `title` is a legend parameter, so it is passed in heatmap_legend_param, not in gpar().
hmm_fract_sp_pat1_mat_hm = Heatmap(
  hmm_fract_sp_pat1_mat_nolabs,
  col = col_fun,
  name = "fracIBDSites",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "fracIBDSites",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

```{r}
#| fig-column: screen
#| fig-width: 27
#| fig-height: 30
draw(hmm_fract_sp_pat1_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
pdf("hmmIBD_fract_sp_mat_hm_pat1.pdf", width = 25, height = 25, useDingbats = FALSE)
draw(hmm_fract_sp_pat1_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

### hmmIBD for just 13-5++ parasites

```{r}
# IBD matrix restricted to pairs where both samples have Pattern "13-5++".
sample_metadata_withAllDeletionCalls_13_5_pattern = sample_metadata_withAllDeletionCalls %>%
  filter(Pattern == "13-5++")
hmm_fract_sp_13_5_pattern = hmm_fract_combined_filt %>%
  filter(
    BiologicalSample1 %in% sample_metadata_withAllDeletionCalls_13_5_pattern$sample,
    BiologicalSample2 %in% sample_metadata_withAllDeletionCalls_13_5_pattern$sample
  ) %>%
  select(BiologicalSample1, BiologicalSample2, fract_sites_IBD) %>%
  spread(BiologicalSample2, fract_sites_IBD, fill = 1)
hmm_fract_sp_13_5_pattern_mat = as.matrix(hmm_fract_sp_13_5_pattern[, 2:ncol(hmm_fract_sp_13_5_pattern)])
rownames(hmm_fract_sp_13_5_pattern_mat) = hmm_fract_sp_13_5_pattern$BiologicalSample1
```

```{r}
library(circlize)
# col_fun = colorRamp2(c(min(hmm_fract_sp_13_5_pattern_mat), min(hmm_fract_sp_13_5_pattern_mat) + (1-min(hmm_fract_sp_13_5_pattern_mat))/2, 1), c( "#2166ac", "white", "#b2182b"))
col_fun = colorRamp2(c(0, 0.5, 1), c("#2166ac", "white", "#b2182b"))

# Per-sample annotation rows in matrix column order.
previousDeletionCalls_sel = previousDeletionCalls[match(colnames(hmm_fract_sp_13_5_pattern_mat), previousDeletionCalls$BiologicalSample), ] %>%
  left_join(
    sample_metadata_withAllDeletionCalls %>%
      select(sample, Pattern) %>%
      rename(BiologicalSample = sample)
  ) %>%
  left_join(
    popClustering_filt_hrp3_pat1_regions_afterHomologous_chr11_prep_forClustering_sp_dist_hclust_groups_df %>%
      ungroup() %>%
      mutate(newClusterName = ifelse(hcclustSize == 1, "singlet", as.character(stringr::str_pad(newClusterName, width = 2, pad = "0")))) %>%
      mutate(BiologicalSample = as.character(BiologicalSample)) %>%
      select(BiologicalSample, newClusterName)
  )

# Only the geographic columns are annotated for this subset.
rowAnnoDf = previousDeletionCalls_sel[, c(
  "country",
  "region",
  "secondaryRegion"
  # "newClusterName",
  # "Pattern",
  # "hrpCall"
)] %>%
  rename(continent = secondaryRegion) %>%
  as.data.frame()

rowAnnoColors = createColorListFromDf(rowAnnoDf)
# NOTE(review): presumably this .Rdata restores a saved `rowAnnoColors`, replacing
# the list computed on the previous line - confirm what the file contains.
load("rowAnnoColors.Rdata")
rowAnnoColors_mod = rowAnnoColors
# rowAnnoColors_mod[["Chr11DupHapCluster"]] = newHaploGroupWithSingletColors
rowAnnoColors_mod[["Pattern"]] = rowAnnoColorsMod_hrp3DeletionPattern

annotationTextSize = 25
annotationTitleTextSize = 20

topAnno = HeatmapAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  show_legend = FALSE,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)
sideAnno = rowAnnotation(
  df = rowAnnoDf,
  col = rowAnnoColors_mod,
  gp = gpar(col = "grey10"),
  annotation_name_gp = gpar(fontsize = annotationTitleTextSize),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  ),
  na_col = "#FFFFFF00"
)

# Copy of the matrix without dimnames so no row/column labels are drawn.
hmm_fract_sp_13_5_pattern_mat_nolabs = hmm_fract_sp_13_5_pattern_mat
rownames(hmm_fract_sp_13_5_pattern_mat_nolabs) = NULL
colnames(hmm_fract_sp_13_5_pattern_mat_nolabs) = NULL

# `title` is a legend parameter, so it is passed in heatmap_legend_param, not in gpar().
hmm_fract_sp_13_5_pattern_mat_hm = Heatmap(
  hmm_fract_sp_13_5_pattern_mat_nolabs,
  col = col_fun,
  name = "fracIBDSites",
  top_annotation = topAnno,
  left_annotation = sideAnno,
  row_dend_width = unit(5, "cm"),
  column_dend_height = unit(5, "cm"),
  na_col = "#FFFFFF00",
  heatmap_legend_param = list(
    title = "fracIBDSites",
    labels_gp = gpar(fontsize = annotationTextSize),
    title_gp = gpar(fontsize = annotationTextSize, fontface = "bold")
  )
)
```

```{r}
#| fig-column: screen
#| fig-width: 15
#| fig-height: 15
draw(hmm_fract_sp_13_5_pattern_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
```

```{r}
pdf("hmmIBD_fract_sp_mat_hm_13_5_pattern.pdf", width = 15, height = 15, useDingbats = FALSE)
draw(hmm_fract_sp_13_5_pattern_mat_hm, background = "transparent", merge_legend = TRUE, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()
```

## moire

```{r, echo=TRUE, eval = FALSE}
# Not evaluated when rendering; a record of the run on the remote machine.
remotes::install_github("EPPIcenter/moire")
# Paths are built with file.path() instead of changing the working directory.
moireDir = "/tank/data/plasmodium/falciparum/pfdata/moire_on_hrp3Samples"
# read.tsv() does not exist in base R or readr; the input is tab-separated.
df <- readr::read_tsv(file.path(moireDir, "allSel_withDeletions_prep_outForMoire.tsv"))
# moire is installed above but never attached, so its functions are namespaced.
data <- moire::load_long_form_data(df)
# With data in appropriate format, run MCMC as follows
mcmc_results <- moire::run_mcmc(data, is_missing = data$is_missing)
write_rds(mcmc_results, file.path(moireDir, "mcmc_results.rds"))
write_rds(data, file.path(moireDir, "data_for_moire.rds"))
```

```{r}
# Summarise each sample's COI values from the first chain as rounded
# median / mean / max.
data_for_moire = read_rds("moire_on_hrp3Samples/data_for_moire.rds")
mcmc_results = read_rds("moire_on_hrp3Samples/mcmc_results.rds")
coiEsts = tibble(
  sampleID = data_for_moire$sample_ids,
  medianCOI = round(unlist(lapply(mcmc_results$chains[[1]]$coi, median))),
  meanCOI = round(unlist(lapply(mcmc_results$chains[[1]]$coi, mean))),
  maxCOI = round(unlist(lapply(mcmc_results$chains[[1]]$coi, max)))
)
```