Code
wget https://seekdeep.brown.edu/data/plasmodiumData/pf.tar.gz
tar -zxvf pf.tar.gz
wget https://seekdeep.brown.edu/data/plasmodiumData/pf_plusPfSD01.tar.gz
tar -zxvf pf_plusPfSD01.tar.gz
Below maps out the annotations of genes on the pacbio assembled genomes from wellcome trust(Otto et al. 2018)
Which can be downloaded via:
cd surroundingRegionsMaterials
elucidator gffRecordIDToGeneInfo --id PF3D7_0831800,PF3D7_1372200 --dout hrps_geneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed hrps_geneInfos/out_PF3D7_0831800.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedHRPII --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
elucidator extractRefSeqsFromGenomes --bed hrps_geneInfos/out_PF3D7_1372200.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedHRPIII --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
## create hmm models
muscle -in extractedHRPII/Pf3D7_08_v3-1374235-1375299-rev/allRefs.fasta -out extractedHRPII/Pf3D7_08_v3-1374235-1375299-rev/aln_allRefs.fasta
hmmbuild --symfrac 0.25 -n PfHRP2 hmm_PfHRP2.txt extractedHRPII/Pf3D7_08_v3-1374235-1375299-rev/aln_allRefs.fasta
muscle -in extractedHRPIII/Pf3D7_13_v3-2840726-2841703-rev/allRefs.fasta -out extractedHRPIII/Pf3D7_13_v3-2840726-2841703-rev/aln_allRefs.fasta
hmmbuild --symfrac 0.25 -n PfHRP3 hmm_PfHRP3.txt extractedHRPIII/Pf3D7_13_v3-2840726-2841703-rev/aln_allRefs.fasta
cat hmm_PfHRP2.txt hmm_PfHRP3.txt > hmms_PfHRPs.txt
rm -f searchRefGenomesForHmmsCmds.txt && for x in /tank/data/genomes/plasmodium/genomes/plasmodiumRefGenomes/genomes/P*.fasta; do echo elucidator runnhmmscan --hmmModel hmms_PfHRPs.txt --fasta ${x} --overWriteDir --trimAtWhiteSpace --dout hmm_hrps_against$(basename ${x%%.fasta}) --defaultParameters=\"--nonull2 --incT 50 --incdomT 50 -T 50 --notextw --cpu 4\" --hardEvalueCutOff 1e-50 >> searchRefGenomesForHmmsCmds.txt ; done;
nohup elucidator runMultipleCommands --cmdFile searchRefGenomesForHmmsCmds.txt --numThreads 4 --raw &
Genes were chosen to include far upstream of the regions of interest on chromosomes 8 (PF3D7_0830300), 11 (PF3D7_1147700) and 13 (PF3D7_1369500) to investigate from there to the ends of the chromosomes
/bin/ls /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta | sed 's/.fasta//g' | sed 's/.*\///g' > strains.txt
# changing from PF3D7_1370300 to PF3D7_1369500 to get similar amount before the rRNA split between 11 and 13
elucidator gffRecordIDToGeneInfo --id PF3D7_0830300,PF3D7_1370300,PF3D7_1369500 --dout hrps_upStreamGeneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed hrps_upStreamGeneInfos/out_PF3D7_0830300.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_0830300 --overWriteDirs --numThreads 15 --extendAndTrim
elucidator extractRefSeqsFromGenomes --bed hrps_upStreamGeneInfos/out_PF3D7_1369500.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_1369500 --overWriteDirs --numThreads 15 --extendAndTrim
# chrom 11
elucidator gffRecordIDToGeneInfo --id PF3D7_1147700 --dout chrom11_upStreamOfrRNA --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed chrom11_upStreamOfrRNA/out_PF3D7_1147700.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_1147700 --overWriteDirs --numThreads 7 --extendAndTrim
Get chromosome lengths to know where to extend to
Extend from the genes of interest to the ends of the chromosomes
mkdir endBeds
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPF3D7_0830300/Pf3D7_08_v3-1290239-1291406-rev/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom08_toEnd.bed --overWrite ; done;
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPF3D7_1369500/Pf3D7_13_v3-2769915-2773480-for/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom13_toEnd.bed --overWrite ; done;
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPF3D7_1147700/Pf3D7_11_v3-1897203-1897884-for/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom11_toEnd.bed --overWrite ; done;
cd endBeds
for x in *toEnd.bed; do elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_genes.bed; done;
for x in *toEnd.bed; do elucidator gffToBedByBedLoc --extraAttributes description --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_all.bed; done;
for x in *toEnd_genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done;
strains = readr::read_tsv("surroundingRegionsMaterials/strains.txt", col_names = c("strains")) %>%
filter("PfIT" != strains)
chromLengths = tibble()
for(strain in strains$strains){
chromLensFnp = paste0("surroundingRegionsMaterials/chromLengths/", strain, ".txt")
strainLens = readr::read_tsv(chromLensFnp, col_names = c("chrom", "length")) %>%
mutate(strain = strain)
chromLengths = bind_rows(chromLengths, strainLens)
}
Determine tandem repeats in all genomes to determine the presence of telomere associated repeitive elements (TARE)(Vernick and McCutchan 1988)
Run tandem repeat finder (Benson 1999)
mkdir -p surroundingRegionsMaterials/alltrfs
cd surroundingRegionsMaterials/alltrfs
rm -f runTRFCmds.txt && for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/*.fasta; do echo elucidator runTRF --trimAtWhiteSpace --supplement --overWriteDir --fasta ${x} --dout trf_$(basename ${x%%.fasta}) >> runTRFCmds.txt ; done;
nohup elucidator runMultipleCommands --cmdFile runTRFCmds.txt --numThreads 16 --raw &
The ends of the telomeres have six blocks of repetitive sequences that were designated telomere-associated repetitive elements (TAREs 1–6) arranged in specific structures(Vernick and McCutchan 1988).
All plasmodium falciparum telomeres end with a 7bp tandem repeat termed TARE-1 Subtelomeric block 1 (SB-1, equivalent to TARE-1), contains the 7-bp telomeric repeat in a variable number of near-exact copies represented by the consensus, TT(T/C)AGGG at the ends and CCCT[AG]AA at the fronts
startTare1Pats = c(
"^CCCT[AG]AA$",
"^CCT[AG]AAC$",
"^CT[AG]AACC$",
"^T[AG]AACCC$",
"^[AG]AACCCT$",
"^AACCCT[AG]$",
"^ACCCT[AG]A$"
)
endTare1Pats = c(
"^TT[TC]AGGG$",
"^T[TC]AGGGT$",
"^[TC]AGGGTT$",
"^AGGGTT[TC]$",
"^GGGTT[TC]A$",
"^GGTT[TC]AG$",
"^GTT[TC]AGG$"
)
startTare1Pats_14 = c(
"^CCCT[AG]AACCCT[AG]AA$",
"^CCT[AG]AACCCT[AG]AAC$",
"^CT[AG]AACCCT[AG]AACC$",
"^T[AG]AACCCT[AG]AACCC$",
"^[AG]AACCCT[AG]AACCCT$",
"^AACCCT[AG]AACCCT[AG]$",
"^ACCCT[AG]AACCCT[AG]A$"
)
endTare1Pats_14 = c(
"^TT[TC]AGGGTT[TC]AGGG$",
"^T[TC]AGGGTT[TC]AGGGT$",
"^[TC]AGGGTT[TC]AGGGTT$",
"^AGGGTT[TC]AGGGTT[TC]$",
"^GGGTT[TC]AGGGTT[TC]A$",
"^GGTT[TC]AGGGTT[TC]AG$",
"^GTT[TC]AGGGTT[TC]AGG$",
"^AGTAAGTAAGTAAG$"
)
allRepeats = tibble()
for(strain in strains$strains){
repeatFnp = paste0("surroundingRegionsMaterials/alltrfs/trf_", strain, "/repeats.bed")
if(file.exists(repeatFnp)){
repeats = readr::read_tsv(repeatFnp, col_names = F) %>%
separate(X4, sep = "__", into = c("name", "repeatInfo"), remove = F) %>%
separate(repeatInfo, sep = "_x", into = c("repeatUnit", "repeatNumber"), convert = T, remove = F) %>%
mutate(repeatUnitLen = nchar(repeatUnit))%>%
mutate(tare1 = (grepl(paste0(c(startTare1Pats, startTare1Pats_14), collapse = "|"), repeatUnit) |
grepl(paste0(c(endTare1Pats, endTare1Pats_14), collapse = "|"), repeatUnit))) %>%
mutate(SB3 = (repeatUnitLen == 21 | repeatUnitLen == 42) & X5 > 1000) %>%
mutate(strain = strain) %>%
left_join(
chromLengths %>%
rename(X1 = chrom,
chromLen = length)
)
allRepeats = bind_rows(
allRepeats,
repeats
)
}
}
allRepeats_filt_starts_ends = bind_rows(
allRepeats_filt_starts,
allRepeats_filt_ends
)%>%
mutate(repeatType = case_when(
tare1 ~ "TARE1",
SB3 ~ "SB3",
T ~ "other"
))
allRepeats_filt_starts_ends_tares = allRepeats_filt_starts_ends %>%
filter(repeatType != "other") %>%
mutate(genome = gsub("_.*", "", X1))
for(genomeName in unique(allRepeats_filt_starts_ends_tares$genome)){
write_tsv(allRepeats_filt_starts_ends_tares %>%
filter(genome == genomeName), paste0("surroundingRegionsMaterials/alltrfs/trf_", genomeName, "/", "allRepeats_filt_starts_ends_tares.tsv"))
}
write_tsv(allRepeats_filt_starts_ends_tares, paste0("surroundingRegionsMaterials/alltrfs/", "allRepeats_filt_starts_ends_tares.tsv"))
allRepeats_filt_starts_ends_tares = allRepeats_filt_starts_ends_tares %>%
mutate(start = X2,
end = X3,
chrom = X1)
In these genomes the chromosomes of 11 and 13’s peri-telomeric regions are not assembled well, likely due to the duplicated region on chr11 and chr13
mkdir -p surroundingRegionsMaterials/possibleFragmentsOf11
cd surroundingRegionsMaterials/possibleFragmentsOf11
elucidator fastaToBed --trimAtWhiteSpace --fasta /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/PfIT.fasta | egrep PfIT_00_10 | elucidator bed3ToBed6 --bed STDIN --out PfIT_00_10.bed --overWrite
elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed PfIT_00_10.bed --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/PfIT.gff --overWrite --out PfIT_00_10_genes.bed
elucidator fastaToBed --trimAtWhiteSpace --fasta /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/PfKH01.fasta | egrep PfKH01_00_27 | elucidator bed3ToBed6 --bed STDIN --out PfKH01_00_27.bed --overWrite
elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed PfKH01_00_27.bed --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/PfKH01.gff --overWrite --out PfKH01_00_27_genes.bed
for x in *genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done;
all08 = tibble()
for(strain in strains$strains){
strain08 = readr::read_tsv(paste0("surroundingRegionsMaterials/endBeds/split_", strain, "_chrom08_toEnd_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all08 = bind_rows(all08, strain08)
}
all08 = all08 %>%
mutate(description = gsub("unknownfunction", "unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all08_strainStarts = all08 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
chromLengths_08 = chromLengths %>%
filter(chrom %in% all08_strainStarts$chrom) %>%
left_join(all08_strainStarts)%>%
mutate(globalStart = 0,
globalEnd = length - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all08 = all08 %>%
left_join(all08_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
all08 = all08 %>%
mutate(chrom = factor(chrom, levels = chromLengths_08$chrom))
all13 = tibble()
for(strain in strains$strains){
strain13 = readr::read_tsv(paste0("surroundingRegionsMaterials/endBeds/split_", strain, "_chrom13_toEnd_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all13 = bind_rows(all13, strain13)
}
all13 = all13 %>%
filter(col.3 %!in% c("Pf3D7_13_v3_2794236_2794851_PF3D7_1370500", "Pf3D7_13_v3_2796118_2797144_PF3D7_1370800", "Pf3D7_13_v3_2797506_2798103_PF3D7_1370900")) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description))%>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description))%>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all13_strainStarts = all13 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
all13 = all13 %>%
left_join(all13_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
chromLengths_13 = chromLengths %>%
filter(chrom %in% all13_strainStarts$chrom) %>%
left_join(all13_strainStarts)%>%
mutate(globalStart = 0,
globalEnd = length - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all13 = all13 %>%
mutate(chrom = factor(chrom, levels = chromLengths_13$chrom))
all11 = tibble()
for(strain in strains$strains){
strain11 = readr::read_tsv(paste0("surroundingRegionsMaterials/endBeds/split_", strain, "_chrom11_toEnd_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all11 = bind_rows(all11, strain11)
}
# PfIT_00_10 = readr::read_tsv(paste0("surroundingRegionsMaterials/possibleFragmentsOf11/split_PfIT_00_10_genes.tab.txt")) %>%
# mutate(strain = "PfIT" ) %>%
# mutate(chromGlobal = gsub(paste0("PfIT", "_"), "", col.0)) %>%
# mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
# rename(chrom = col.0,
# start = col.1,
# end = col.2)
# all11 = bind_rows(all11, PfIT_00_10)
PfKH01_00_27 = readr::read_tsv(paste0("surroundingRegionsMaterials/possibleFragmentsOf11/split_PfKH01_00_27_genes.tab.txt")) %>%
mutate(strain = "PfKH01" ) %>%
mutate(chromGlobal = gsub(paste0("PfKH01", "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
# %>%
# mutate(start = abs(start - 160672))%>%
# mutate(end = abs(end - 160672))
all11 = bind_rows(all11, PfKH01_00_27)
all11 = all11 %>%
filter(col.3 %!in% c("Pf3D7_11_v3_1916391_1918050_PF3D7_1148100", "Pf3D7_11_v3_1920267_1920811_PF3D7_1148200", "Pf3D7_11_v3_1922655_1922928_PF3D7_1148500")) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description))%>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description))%>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all11_strainStarts = all11 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
all11 = all11 %>%
left_join(all11_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
chromLengths_11 = chromLengths %>%
filter(chrom %in% all11_strainStarts$chrom) %>%
left_join(all11_strainStarts) %>%
mutate(globalStart = 0,
globalEnd = length - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all11 = all11 %>%
mutate(chrom = factor(chrom, levels = chromLengths_11$chrom)) %>%
mutate(globalStart = ifelse("PfKH01_00_27" == chrom, globalStart + 17822, globalStart))%>%
mutate(globalEnd = ifelse("PfKH01_00_27" == chrom, globalEnd + 17822, globalEnd))
# %>%
# mutate(globalStart = ifelse("PfIT_00_10" == chrom, globalStart + 28623, globalStart))%>%
# mutate(globalEnd = ifelse("PfIT_00_10" == chrom, globalEnd + 28623 , globalEnd))
chromLengths_11 = chromLengths_11 %>%
mutate(globalStart = ifelse("PfKH01_00_27" == chrom, globalStart + 17822, globalStart))%>%
mutate(globalEnd = ifelse("PfKH01_00_27" == chrom, globalEnd + 17822, globalEnd))
# %>%
# mutate(globalStart = ifelse("PfIT_00_10" == chrom, globalStart + 28623, globalStart))%>%
# mutate(globalEnd = ifelse("PfIT_00_10" == chrom, globalEnd + 28623, globalEnd))
descriptionColorsNames = unique(c(all08$description, all13$description, all11$description))
rawDescriptionColors = scheme$hex(length(descriptionColorsNames))
names(rawDescriptionColors) = descriptionColorsNames
genomicElementsColors = c()
genomicElementsColors["TARE1"] = "#F748A5"
genomicElementsColors["SB3"] = "#F0E442"
rawGeneAnnotationsColors = rawDescriptionColors
descriptionColors = c(rawDescriptionColors, genomicElementsColors)
geneAnnotationsColors = descriptionColors
allRepeats_filt_starts_ends_tares_chrom08 = allRepeats_filt_starts_ends_tares %>%
filter(grepl("_08", X1)) %>%
left_join(all08_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart) %>%
filter(globalStart > 0)
allRepeats_filt_starts_ends_tares_chrom08 = allRepeats_filt_starts_ends_tares_chrom08 %>%
mutate(chrom = factor(chrom, levels = chromLengths_08$chrom))
chrom08_plot = ggplot() +
geom_rect(data = chromLengths_08, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all08, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
geom_rect(data = allRepeats_filt_starts_ends_tares_chrom08, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.2,
ymax = as.numeric(chrom) + 0.2,
fill = repeatType,
color = repeatType, strain = strain)
# , color = "black"
) +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_08$chrom)), labels = as.character(chromLengths_08$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all08$description, allRepeats_filt_starts_ends_tares_chrom08$repeatType)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all08$description, allRepeats_filt_starts_ends_tares_chrom08$repeatType)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all08$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
#cairo_pdf("chrom08_plot.pdf", height = 12, width = 20)
pdf("chrom08_plot.pdf", height = 20, width = 25, useDingbats = F)
print(chrom08_plot +
scale_fill_manual(
"Genes\nDescription",
values = geneAnnotationsColors,
breaks = names(current_geneAnnotationsColors),
labels = names(current_geneAnnotationsColors),
guide = guide_legend(
override.aes = list(
color = current_geneAnnotationsColors,
fill = current_geneAnnotationsColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 7,
order = 1
)
) +
scale_color_manual(
"Genomic\nElements",
values = alpha(geneAnnotationsColors, 0),
# values = c("black", "black"),
breaks = names(genomicElementsColors),
labels = names(genomicElementsColors),
guide = guide_legend(
override.aes = list(
color = genomicElementsColors,
fill = genomicElementsColors,
alpha = 1,
shape = 22,
size = 5
),
ncol = 1,
order = 2
)
) +
theme(legend.text = element_text(size = 30), axis.text.x = element_text(size = 30, color = "black"), axis.text.y = element_text(size = 30, color = "black"),
legend.title = element_text(size = 30, face = "bold"),
legend.box="vertical", legend.margin=margin(),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
)
dev.off()
quartz_off_screen
2
plotly::ggplotly(chrom08_plot)
allRepeats_filt_starts_ends_tares_chrom11 = allRepeats_filt_starts_ends_tares %>%
filter(grepl("_11", X1)) %>%
left_join(all11_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart) %>%
filter(globalStart > 0)
allRepeats_filt_starts_ends_tares_chrom11 = allRepeats_filt_starts_ends_tares_chrom11 %>%
mutate(chrom = factor(chrom, levels = chromLengths_11$chrom))
chrom11_plot = ggplot() +
geom_rect(data = chromLengths_11, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all11, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
geom_rect(data = allRepeats_filt_starts_ends_tares_chrom11, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.2,
ymax = as.numeric(chrom) + 0.2,
fill = repeatType,
color = repeatType, strain = strain)
# , color = "black"
) +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_11$chrom)), labels = as.character(chromLengths_11$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all11$description, allRepeats_filt_starts_ends_tares_chrom11$repeatType)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all11$description, allRepeats_filt_starts_ends_tares_chrom11$repeatType)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all11$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
#cairo_pdf("chrom11_plot.pdf", height = 12, width = 20)
pdf("chrom11_plot.pdf", height = 20, width = 25, useDingbats = F)
print(chrom11_plot +
scale_fill_manual(
"Genes\nDescription",
values = geneAnnotationsColors,
breaks = names(current_geneAnnotationsColors),
labels = names(current_geneAnnotationsColors),
guide = guide_legend(
override.aes = list(
color = current_geneAnnotationsColors,
fill = current_geneAnnotationsColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 7,
order = 1
)
) +
scale_color_manual(
"Genomic\nElements",
values = alpha(geneAnnotationsColors, 0),
# values = c("black", "black"),
breaks = names(genomicElementsColors),
labels = names(genomicElementsColors),
guide = guide_legend(
override.aes = list(
color = genomicElementsColors,
fill = genomicElementsColors,
alpha = 1,
shape = 22,
size = 5
),
ncol = 1,
order = 2
)
) +
theme(legend.text = element_text(size = 30), axis.text.x = element_text(size = 30, color = "black"), axis.text.y = element_text(size = 30, color = "black"),
legend.title = element_text(size = 30, face = "bold"),
legend.box="vertical", legend.margin=margin(),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
)
dev.off()
quartz_off_screen
2
plotly::ggplotly(chrom11_plot)
allRepeats_filt_starts_ends_tares_chrom13 = allRepeats_filt_starts_ends_tares %>%
filter(grepl("_13", X1)) %>%
left_join(all13_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart) %>%
filter(globalStart > 0)
allRepeats_filt_starts_ends_tares_chrom13 = allRepeats_filt_starts_ends_tares_chrom13 %>%
mutate(chrom = factor(chrom, levels = chromLengths_13$chrom))
chrom13_plot = ggplot() +
geom_rect(data = chromLengths_13, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all13, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
geom_rect(data = allRepeats_filt_starts_ends_tares_chrom13, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.2,
ymax = as.numeric(chrom) + 0.2,
fill = repeatType,
color = repeatType, strain = strain)
# , color = "black"
) +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_13$chrom)), labels = as.character(chromLengths_13$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all13$description, allRepeats_filt_starts_ends_tares_chrom13$repeatType)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all13$description, allRepeats_filt_starts_ends_tares_chrom13$repeatType)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all13$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
#cairo_pdf("chrom13_plot.pdf", height = 12, width = 20)
pdf("chrom13_plot.pdf", height = 20, width = 25, useDingbats = F)
print(chrom13_plot +
scale_fill_manual(
"Genes\nDescription",
values = geneAnnotationsColors,
breaks = names(current_geneAnnotationsColors),
labels = names(current_geneAnnotationsColors),
guide = guide_legend(
override.aes = list(
color = current_geneAnnotationsColors,
fill = current_geneAnnotationsColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 9,
order = 1
)
) +
scale_color_manual(
"Genomic\nElements",
values = alpha(geneAnnotationsColors, 0),
# values = c("black", "black"),
breaks = names(genomicElementsColors),
labels = names(genomicElementsColors),
guide = guide_legend(
override.aes = list(
color = genomicElementsColors,
fill = genomicElementsColors,
alpha = 1,
shape = 22,
size = 5
),
ncol = 1,
order = 2
)
) +
theme(legend.text = element_text(size = 30), axis.text.x = element_text(size = 30, color = "black"), axis.text.y = element_text(size = 30, color = "black"),
legend.title = element_text(size = 30, face = "bold"),
legend.box="vertical", legend.margin=margin(),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
)
dev.off()
quartz_off_screen
2
plotly::ggplotly(chrom13_plot)
Determining how much of the core genome are involved in the re-arrangement between chromosomes 11 and 13, using only strains that do not have evidence of these re-arrangements
selectStrains = c("Pf3D7","Pf7G8","PfCD01","PfDd2","PfGA01","PfGB4","PfGN01","PfKE01","PfKH02","PfSN01")
allRepeats_filt_starts_ends_tares_chrom11_sel_tare1 = allRepeats_filt_starts_ends_tares_chrom11 %>%
filter(strain %in% selectStrains) %>%
filter(tare1)
chromLengths_11_sel = chromLengths_11 %>%
filter(strain %in% selectStrains)
all11_sel = all11 %>%
filter(strain %in% selectStrains)
all11_sel_rRNA = all11_sel %>%
filter(description %in% c("rRNA")) %>%
group_by(strain) %>%
slice_max(end)
all11_sel_DnaJ = all11_sel %>%
filter(grepl("DnaJ", description ))
chromLengths_11_sel_full = chromLengths_11_sel %>%
select(-minStart,-globalStart,-globalEnd) %>%
filter(strain %in% allRepeats_filt_starts_ends_tares_chrom11_sel_tare1$strain)
chromLengths_11_sel_full = chromLengths_11_sel_full %>%
left_join(all11_sel_rRNA %>%
select(strain, chrom, end) %>%
rename(duplicatedRegionEnd = end)) %>%
left_join(all11_sel_DnaJ%>%
select(strain, chrom, end) %>%
rename(coreRegionEnd = end))
chromLengths_11_sel_full = chromLengths_11_sel_full %>%
mutate(coreRegionDuplicated = coreRegionEnd - duplicatedRegionEnd,
additionalParalogousRegion = length - coreRegionEnd)
create_dt(chromLengths_11_sel_full)
skimr::skim(chromLengths_11_sel_full)
Name | chromLengths_11_sel_full |
Number of rows | 10 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
strain | 0 | 1 | 5 | 6 | 0 | 10 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
chrom | 0 | 1 | FALSE | 10 | Pf3: 1, Pf7: 1, PfC: 1, PfD: 1 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
length | 0 | 1 | 2026568 | 47885.59 | 1964163 | 1989834.50 | 2036634 | 2039867.75 | 2133954 | ▆▁▇▁▂ |
duplicatedRegionEnd | 0 | 1 | 1905321 | 40975.27 | 1845730 | 1879163.50 | 1906840 | 1932135.25 | 1976854 | ▅▂▇▇▂ |
coreRegionEnd | 0 | 1 | 1975401 | 41042.49 | 1915924 | 1949238.50 | 1976922 | 2002354.00 | 2046992 | ▅▂▇▇▂ |
coreRegionDuplicated | 0 | 1 | 70080 | 175.22 | 69689 | 70048.75 | 70104 | 70165.75 | 70350 | ▂▂▆▇▂ |
additionalParalogousRegion | 0 | 1 | 51167 | 25204.69 | 15391 | 35132.75 | 51375 | 69749.75 | 86962 | ▅▇▁▇▅ |
allRepeats_filt_starts_ends_tares_chrom13_sel_tare1 = allRepeats_filt_starts_ends_tares_chrom13 %>%
filter(strain %in% selectStrains) %>%
filter(tare1)
chromLengths_13_sel = chromLengths_13 %>%
filter(strain %in% selectStrains)
all13_sel = all13 %>%
filter(strain %in% selectStrains)
all13_sel_rRNA = all13_sel %>%
filter(description %in% c("rRNA")) %>%
group_by(strain) %>%
slice_max(end)
all13_sel_acylCoA = all13_sel %>%
filter(grepl("acyl-CoA", description ))
chromLengths_13_sel_full = chromLengths_13_sel %>%
select(-minStart,-globalStart,-globalEnd) %>%
filter(strain %in% allRepeats_filt_starts_ends_tares_chrom13_sel_tare1$strain)
chromLengths_13_sel_full = chromLengths_13_sel_full %>%
left_join(all13_sel_rRNA %>%
select(strain, chrom, end) %>%
rename(duplicatedRegionEnd = end)) %>%
left_join(all13_sel_acylCoA%>%
select(strain, chrom, end) %>%
rename(coreRegionEnd = end))
chromLengths_13_sel_full = chromLengths_13_sel_full %>%
mutate(coreRegionDeleted = coreRegionEnd - duplicatedRegionEnd,
additionalParalogousRegion = length - coreRegionEnd)
create_dt(chromLengths_13_sel_full)
skimr::skim(chromLengths_13_sel_full)
Name | chromLengths_13_sel_full |
Number of rows | 8 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
strain | 0 | 1 | 5 | 6 | 0 | 8 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
chrom | 0 | 1 | FALSE | 8 | Pf3: 1, Pf7: 1, PfC: 1, PfD: 1 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
length | 0 | 1 | 2886142.12 | 78683.24 | 2753963 | 2823319.25 | 2923910 | 2934451.50 | 2977552 | ▂▅▁▇▅ |
duplicatedRegionEnd | 0 | 1 | 2784894.88 | 64610.49 | 2689900 | 2741810.75 | 2808584 | 2825068.50 | 2865317 | ▅▂▁▇▅ |
coreRegionEnd | 0 | 1 | 2831316.12 | 64716.56 | 2736274 | 2787976.00 | 2855123 | 2871514.25 | 2911880 | ▅▂▁▇▅ |
coreRegionDeleted | 0 | 1 | 46421.25 | 193.54 | 46127 | 46312.25 | 46396 | 46537.50 | 46756 | ▂▇▂▅▂ |
additionalParalogousRegion | 0 | 1 | 54826.00 | 23647.87 | 17689 | 39175.00 | 64312 | 68231.25 | 84736 | ▅▂▁▇▅ |
all11_rRNAEnd_3D7 = all11 %>%
filter(col.3 == "Pf3D7_11_v3_1928933_1933138_PF3D7_1148640")
all11_DNAJ = all11 %>%
filter(col.3 == "Pf3D7_11_v3_2001067_2003313_PF3D7_1149600")
chrom11CoreLen = all11_DNAJ$end - all11_rRNAEnd_3D7$end[1]
all13_rRNA = all13 %>%
filter(description %in% c("rRNA"))
all13_rRNAEnd_3D7 = all13 %>%
filter(col.3 == "Pf3D7_13_v3_2802944_2807159_PF3D7_1371300")
all13_acylCoA = all13 %>%
filter(col.3 == "Pf3D7_13_v3_2850890_2853482_PF3D7_1372400")
chrom13CoreLen = all13_acylCoA$end - all13_rRNAEnd_3D7$end[1]
create_dt(tibble(
genomeCore = c("chrom11CoreLen", "chrom13CoreLen"),
lens = c(chrom11CoreLen, chrom13CoreLen)
))
Getting the region surrounding the duplicate region of pfmdr1 on chromosome 5 associated with chr13/pfhrp3 deletion
pfmdr1_windows = readr::read_tsv("../windowAnalysis/windows/stableWindows_05_regionOfInterestNarrow.bed", col_names = F)
pfmdr1_windows_largeWindow = pfmdr1_windows %>%
group_by(X1) %>%
summarise(X2 = min(X2),
X3 = max(X3)) %>%
mutate(name = paste0(X1, "-", X2, "-", X3)) %>%
mutate(len = X3 - X2,
strand = "+")
write_tsv(pfmdr1_windows_largeWindow, col_names = F,file = "surroundingRegionsMaterials/pfmdr1_region/pfmdr1_windows_largeWindow.bed")
pfmdr1_windows_geneIds = pfmdr1_windows %>%
mutate(geneID = gsub("[ID=", "", X7, fixed = T)) %>%
mutate(geneID = gsub(";.*", "", geneID)) %>%
select(geneID) %>%
filter(!is.na(geneID)) %>%
arrange() %>%
unique()
write_tsv(pfmdr1_windows[1,], col_names = F, file = "surroundingRegionsMaterials/pfmdr1_region/firstRegion.bed")
cat(pfmdr1_windows_geneIds$geneID, sep = "\n", file = "surroundingRegionsMaterials/pfmdr1_region/geneIDs.txt")
cd surroundingRegionsMaterials/pfmdr1_region
elucidator gffRecordIDToGeneInfo --id geneIDs.txt --dout pfmdr1_region_geneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed firstRegion.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir firstRegion_pfmdr1_region --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
elucidator extractRefSeqsFromGenomes --bed pfmdr1_region_geneInfos/out_PF3D7_0523700.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir lastGene_pfmdr1_region --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
mkdir regionBeds
strains = readr::read_tsv("surroundingRegionsMaterials/strains.txt", col_names = c("strains"))
all_region_around_mdr1 = tibble()
for(strain in strains$strains){
firstRegion = readr::read_tsv(paste0("surroundingRegionsMaterials/pfmdr1_region/firstRegion_pfmdr1_region/Pf3D7_05_v3-929384-929984-for/beds/", strain, "_bestRegion.bed"), col_names = F)
lastRegion = readr::read_tsv(paste0("surroundingRegionsMaterials/pfmdr1_region/lastGene_pfmdr1_region/Pf3D7_05_v3-980753-985354-rev/beds/", strain, "_bestRegion.bed"), col_names = F)
region_around_mdr1 = tibble(
X1 = firstRegion$X1[1],
X2 = firstRegion$X2[1],
X3 = lastRegion$X3[1]
) %>%
mutate(name = paste0(X1, "-", X2, "-", X3),
len = X3 - X2,
strand = "+")
all_region_around_mdr1 = bind_rows(
all_region_around_mdr1,
region_around_mdr1 %>%
mutate(strain = strain)
)
write_tsv(region_around_mdr1, col_names = F, file = paste0("surroundingRegionsMaterials/pfmdr1_region/regionBeds/", strain, "_pfmdr1Region.bed"))
}
cd surroundingRegionsMaterials/pfmdr1_region/regionBeds
for x in *pfmdr1Region.bed; do elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_genes.bed; done;
for x in *pfmdr1Region.bed; do elucidator gffToBedByBedLoc --extraAttributes description --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_all.bed; done;
for x in *pfmdr1Region_genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done;
all05 = tibble()
for(strain in strains$strains){
strain05 = readr::read_tsv(paste0("surroundingRegionsMaterials/pfmdr1_region/regionBeds/split_", strain, "_pfmdr1Region_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all05 = bind_rows(all05, strain05)
}
all05 = all05 %>%
mutate(description = gsub("unknownfunction", "unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub("iron-sulfur assembly protein, putative", "iron-sulfur assembly protein", description)) %>%
mutate(description = gsub("mitochondrial processing peptidase alphasubunit, putative", "mitochondrial-processing peptidase subunit alpha, putative", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all05_strainStarts = all05 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
chromLengths_05 = all_region_around_mdr1 %>%
select(X1, X3, strain) %>%
rename(chrom = X1,
end = X3) %>%
left_join(all05_strainStarts)%>%
mutate(globalStart = 0,
globalEnd = end - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all05 = all05 %>%
left_join(all05_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
all05 = all05 %>%
mutate(chrom = factor(chrom, levels = chromLengths_05$chrom))
chrom05_plot = ggplot() +
geom_rect(data = chromLengths_05, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all05, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_05$chrom)), labels = as.character(chromLengths_05$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all05$description)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all05$description)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = rawGeneAnnotationsColors[names(rawGeneAnnotationsColors) %in% all05$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
#cairo_pdf("chrom05_plot.pdf", height = 12, width = 20)
pdf("chrom05_plot.pdf", height = 12, width = 20, useDingbats = F)
print(chrom05_plot +
scale_fill_manual(
"Genes\nDescription",
values = descriptionColors,
breaks = names(descriptionColors),
labels = names(descriptionColors),
guide = guide_legend(
override.aes = list(
color = descriptionColors,
fill = descriptionColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 3,
order = 1
)
)
)
dev.off()
quartz_off_screen
2
plotly::ggplotly(chrom05_plot)
---
title: Extracting Locations interest in all genomes
---
```{r setup, echo=FALSE, message=FALSE}
source("../common.R")
```
Below maps out the annotations of genes on the [pacbio assembled genomes from wellcome trust](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5964635/)[@Otto2018-bb]
Which can be downloaded via:
```{bash, eval = F}
wget https://seekdeep.brown.edu/data/plasmodiumData/pf.tar.gz
tar -zxvf pf.tar.gz
wget https://seekdeep.brown.edu/data/plasmodiumData/pf_plusPfSD01.tar.gz
tar -zxvf pf_plusPfSD01.tar.gz
```
```{bash, eval = F}
cd surroundingRegionsMaterials
elucidator gffRecordIDToGeneInfo --id PF3D7_0831800,PF3D7_1372200 --dout hrps_geneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed hrps_geneInfos/out_PF3D7_0831800.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedHRPII --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
elucidator extractRefSeqsFromGenomes --bed hrps_geneInfos/out_PF3D7_1372200.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedHRPIII --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
## create hmm models
muscle -in extractedHRPII/Pf3D7_08_v3-1374235-1375299-rev/allRefs.fasta -out extractedHRPII/Pf3D7_08_v3-1374235-1375299-rev/aln_allRefs.fasta
hmmbuild --symfrac 0.25 -n PfHRP2 hmm_PfHRP2.txt extractedHRPII/Pf3D7_08_v3-1374235-1375299-rev/aln_allRefs.fasta
muscle -in extractedHRPIII/Pf3D7_13_v3-2840726-2841703-rev/allRefs.fasta -out extractedHRPIII/Pf3D7_13_v3-2840726-2841703-rev/aln_allRefs.fasta
hmmbuild --symfrac 0.25 -n PfHRP3 hmm_PfHRP3.txt extractedHRPIII/Pf3D7_13_v3-2840726-2841703-rev/aln_allRefs.fasta
cat hmm_PfHRP2.txt hmm_PfHRP3.txt > hmms_PfHRPs.txt
rm -f searchRefGenomesForHmmsCmds.txt && for x in /tank/data/genomes/plasmodium/genomes/plasmodiumRefGenomes/genomes/P*.fasta; do echo elucidator runnhmmscan --hmmModel hmms_PfHRPs.txt --fasta ${x} --overWriteDir --trimAtWhiteSpace --dout hmm_hrps_against$(basename ${x%%.fasta}) --defaultParameters=\"--nonull2 --incT 50 --incdomT 50 -T 50 --notextw --cpu 4\" --hardEvalueCutOff 1e-50 >> searchRefGenomesForHmmsCmds.txt ; done;
nohup elucidator runMultipleCommands --cmdFile searchRefGenomesForHmmsCmds.txt --numThreads 4 --raw &
```
# Extracting from ends of chrom 8, 11, 13
Genes were chosen to include far upstream of the regions of interest on chromosomes 8 ([PF3D7_0830300](https://plasmodb.org/plasmo/app/record/gene/PF3D7_0830300)), 11 ([PF3D7_1147700](https://plasmodb.org/plasmo/app/record/gene/PF3D7_1147700)) and 13 ([PF3D7_1369500](https://plasmodb.org/plasmo/app/record/gene/PF3D7_1369500)) to investigate from there to the ends of the chromosomes
```{bash, eval = F}
/bin/ls /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta | sed 's/.fasta//g' | sed 's/.*\///g' > strains.txt
# changing from PF3D7_1370300 to PF3D7_1369500 to get similar amount before the rRNA split between 11 and 13
elucidator gffRecordIDToGeneInfo --id PF3D7_0830300,PF3D7_1370300,PF3D7_1369500 --dout hrps_upStreamGeneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed hrps_upStreamGeneInfos/out_PF3D7_0830300.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_0830300 --overWriteDirs --numThreads 15 --extendAndTrim
elucidator extractRefSeqsFromGenomes --bed hrps_upStreamGeneInfos/out_PF3D7_1369500.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_1369500 --overWriteDirs --numThreads 15 --extendAndTrim
# chrom 11
elucidator gffRecordIDToGeneInfo --id PF3D7_1147700 --dout chrom11_upStreamOfrRNA --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed chrom11_upStreamOfrRNA/out_PF3D7_1147700.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_1147700 --overWriteDirs --numThreads 7 --extendAndTrim
```
Get chromosome lengths to know where to extend to
```{bash, eval = F}
mkdir chromLengths
cd chromLengths
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator getReadLens --fasta ${x} --trimAtWhiteSpace > $(basename ${x%%.fasta}).txt; done;
cd ..
```
Extend from the genes of interest to the ends of the chromosomes
```{bash, eval = F}
mkdir endBeds
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPF3D7_0830300/Pf3D7_08_v3-1290239-1291406-rev/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom08_toEnd.bed --overWrite ; done;
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPF3D7_1369500/Pf3D7_13_v3-2769915-2773480-for/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom13_toEnd.bed --overWrite ; done;
for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPF3D7_1147700/Pf3D7_11_v3-1897203-1897884-for/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom11_toEnd.bed --overWrite ; done;
cd endBeds
for x in *toEnd.bed; do elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_genes.bed; done;
for x in *toEnd.bed; do elucidator gffToBedByBedLoc --extraAttributes description --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_all.bed; done;
for x in *toEnd_genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done;
```
# Read in chromosome lengths
```{r}
strains = readr::read_tsv("surroundingRegionsMaterials/strains.txt", col_names = c("strains")) %>%
filter("PfIT" != strains)
chromLengths = tibble()
for(strain in strains$strains){
chromLensFnp = paste0("surroundingRegionsMaterials/chromLengths/", strain, ".txt")
strainLens = readr::read_tsv(chromLensFnp, col_names = c("chrom", "length")) %>%
mutate(strain = strain)
chromLengths = bind_rows(chromLengths, strainLens)
}
```
# finding tandem repeats to determine TAREs
Determine tandem repeats in all genomes to determine the presence of telomere associated repeitive elements (TARE)[@Vernick1988-mu]
### TAREs
Run tandem repeat finder [@Benson1999-hx]
```{bash, eval = F}
mkdir -p surroundingRegionsMaterials/alltrfs
cd surroundingRegionsMaterials/alltrfs
rm -f runTRFCmds.txt && for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/*.fasta; do echo elucidator runTRF --trimAtWhiteSpace --supplement --overWriteDir --fasta ${x} --dout trf_$(basename ${x%%.fasta}) >> runTRFCmds.txt ; done;
nohup elucidator runMultipleCommands --cmdFile runTRFCmds.txt --numThreads 16 --raw &
```
The ends of the telomeres have six blocks of repetitive sequences that were designated [telomere-associated repetitive elements (TAREs 1–6) ](https://www.sciencedirect.com/science/article/pii/0166685188900552) arranged in specific structures[@Vernick1988-mu].
All plasmodium falciparum telomeres end with a 7bp tandem repeat termed TARE-1
Subtelomeric block 1 (SB-1, equivalent to TARE-1), contains the 7-bp telomeric repeat in a variable number of near-exact copies
represented by the consensus, TT(T/C)AGGG at the ends and CCCT[AG]AA at the fronts
```{r}
startTare1Pats = c(
"^CCCT[AG]AA$",
"^CCT[AG]AAC$",
"^CT[AG]AACC$",
"^T[AG]AACCC$",
"^[AG]AACCCT$",
"^AACCCT[AG]$",
"^ACCCT[AG]A$"
)
endTare1Pats = c(
"^TT[TC]AGGG$",
"^T[TC]AGGGT$",
"^[TC]AGGGTT$",
"^AGGGTT[TC]$",
"^GGGTT[TC]A$",
"^GGTT[TC]AG$",
"^GTT[TC]AGG$"
)
startTare1Pats_14 = c(
"^CCCT[AG]AACCCT[AG]AA$",
"^CCT[AG]AACCCT[AG]AAC$",
"^CT[AG]AACCCT[AG]AACC$",
"^T[AG]AACCCT[AG]AACCC$",
"^[AG]AACCCT[AG]AACCCT$",
"^AACCCT[AG]AACCCT[AG]$",
"^ACCCT[AG]AACCCT[AG]A$"
)
endTare1Pats_14 = c(
"^TT[TC]AGGGTT[TC]AGGG$",
"^T[TC]AGGGTT[TC]AGGGT$",
"^[TC]AGGGTT[TC]AGGGTT$",
"^AGGGTT[TC]AGGGTT[TC]$",
"^GGGTT[TC]AGGGTT[TC]A$",
"^GGTT[TC]AGGGTT[TC]AG$",
"^GTT[TC]AGGGTT[TC]AGG$",
"^AGTAAGTAAGTAAG$"
)
```
### Read in repeats
```{r}
allRepeats = tibble()
for(strain in strains$strains){
repeatFnp = paste0("surroundingRegionsMaterials/alltrfs/trf_", strain, "/repeats.bed")
if(file.exists(repeatFnp)){
repeats = readr::read_tsv(repeatFnp, col_names = F) %>%
separate(X4, sep = "__", into = c("name", "repeatInfo"), remove = F) %>%
separate(repeatInfo, sep = "_x", into = c("repeatUnit", "repeatNumber"), convert = T, remove = F) %>%
mutate(repeatUnitLen = nchar(repeatUnit))%>%
mutate(tare1 = (grepl(paste0(c(startTare1Pats, startTare1Pats_14), collapse = "|"), repeatUnit) |
grepl(paste0(c(endTare1Pats, endTare1Pats_14), collapse = "|"), repeatUnit))) %>%
mutate(SB3 = (repeatUnitLen == 21 | repeatUnitLen == 42) & X5 > 1000) %>%
mutate(strain = strain) %>%
left_join(
chromLengths %>%
rename(X1 = chrom,
chromLen = length)
)
allRepeats = bind_rows(
allRepeats,
repeats
)
}
}
```
```{r}
allRepeats_filt = allRepeats %>%
filter(!(grepl("API", X1) | grepl("MIT", X1) | grepl("MT", X1)))
allRepeats_filt_starts = allRepeats_filt %>%
filter(X2 <= 120000)
allRepeats_filt_ends = allRepeats_filt %>%
mutate(endPos = chromLen - 120000) %>%
filter(X2 >= endPos)
```
```{r}
allRepeats_filt_starts_ends = bind_rows(
allRepeats_filt_starts,
allRepeats_filt_ends
)%>%
mutate(repeatType = case_when(
tare1 ~ "TARE1",
SB3 ~ "SB3",
T ~ "other"
))
allRepeats_filt_starts_ends_tares = allRepeats_filt_starts_ends %>%
filter(repeatType != "other") %>%
mutate(genome = gsub("_.*", "", X1))
for(genomeName in unique(allRepeats_filt_starts_ends_tares$genome)){
write_tsv(allRepeats_filt_starts_ends_tares %>%
filter(genome == genomeName), paste0("surroundingRegionsMaterials/alltrfs/trf_", genomeName, "/", "allRepeats_filt_starts_ends_tares.tsv"))
}
write_tsv(allRepeats_filt_starts_ends_tares, paste0("surroundingRegionsMaterials/alltrfs/", "allRepeats_filt_starts_ends_tares.tsv"))
allRepeats_filt_starts_ends_tares = allRepeats_filt_starts_ends_tares %>%
mutate(start = X2,
end = X3,
chrom = X1)
```
## finding possible fragments of 11
In these genomes the chromosomes of 11 and 13's peri-telomeric regions are not assembled well, likely due to the duplicated region on chr11 and chr13
```{bash, eval = F}
mkdir -p surroundingRegionsMaterials/possibleFragmentsOf11
cd surroundingRegionsMaterials/possibleFragmentsOf11
elucidator fastaToBed --trimAtWhiteSpace --fasta /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/PfIT.fasta | egrep PfIT_00_10 | elucidator bed3ToBed6 --bed STDIN --out PfIT_00_10.bed --overWrite
elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed PfIT_00_10.bed --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/PfIT.gff --overWrite --out PfIT_00_10_genes.bed
elucidator fastaToBed --trimAtWhiteSpace --fasta /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/PfKH01.fasta | egrep PfKH01_00_27 | elucidator bed3ToBed6 --bed STDIN --out PfKH01_00_27.bed --overWrite
elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed PfKH01_00_27.bed --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/PfKH01.gff --overWrite --out PfKH01_00_27_genes.bed
for x in *genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done;
```
# Reading In Genes
## 8
```{r}
all08 = tibble()
for(strain in strains$strains){
strain08 = readr::read_tsv(paste0("surroundingRegionsMaterials/endBeds/split_", strain, "_chrom08_toEnd_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all08 = bind_rows(all08, strain08)
}
all08 = all08 %>%
mutate(description = gsub("unknownfunction", "unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all08_strainStarts = all08 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
chromLengths_08 = chromLengths %>%
filter(chrom %in% all08_strainStarts$chrom) %>%
left_join(all08_strainStarts)%>%
mutate(globalStart = 0,
globalEnd = length - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all08 = all08 %>%
left_join(all08_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
all08 = all08 %>%
mutate(chrom = factor(chrom, levels = chromLengths_08$chrom))
```
## 13
```{r}
all13 = tibble()
for(strain in strains$strains){
strain13 = readr::read_tsv(paste0("surroundingRegionsMaterials/endBeds/split_", strain, "_chrom13_toEnd_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all13 = bind_rows(all13, strain13)
}
all13 = all13 %>%
filter(col.3 %!in% c("Pf3D7_13_v3_2794236_2794851_PF3D7_1370500", "Pf3D7_13_v3_2796118_2797144_PF3D7_1370800", "Pf3D7_13_v3_2797506_2798103_PF3D7_1370900")) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description))%>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description))%>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all13_strainStarts = all13 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
all13 = all13 %>%
left_join(all13_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
chromLengths_13 = chromLengths %>%
filter(chrom %in% all13_strainStarts$chrom) %>%
left_join(all13_strainStarts)%>%
mutate(globalStart = 0,
globalEnd = length - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all13 = all13 %>%
mutate(chrom = factor(chrom, levels = chromLengths_13$chrom))
```
## 11
```{r}
all11 = tibble()
for(strain in strains$strains){
strain11 = readr::read_tsv(paste0("surroundingRegionsMaterials/endBeds/split_", strain, "_chrom11_toEnd_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all11 = bind_rows(all11, strain11)
}
# PfIT_00_10 = readr::read_tsv(paste0("surroundingRegionsMaterials/possibleFragmentsOf11/split_PfIT_00_10_genes.tab.txt")) %>%
# mutate(strain = "PfIT" ) %>%
# mutate(chromGlobal = gsub(paste0("PfIT", "_"), "", col.0)) %>%
# mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
# rename(chrom = col.0,
# start = col.1,
# end = col.2)
# all11 = bind_rows(all11, PfIT_00_10)
PfKH01_00_27 = readr::read_tsv(paste0("surroundingRegionsMaterials/possibleFragmentsOf11/split_PfKH01_00_27_genes.tab.txt")) %>%
mutate(strain = "PfKH01" ) %>%
mutate(chromGlobal = gsub(paste0("PfKH01", "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
# %>%
# mutate(start = abs(start - 160672))%>%
# mutate(end = abs(end - 160672))
all11 = bind_rows(all11, PfKH01_00_27)
all11 = all11 %>%
filter(col.3 %!in% c("Pf3D7_11_v3_1916391_1918050_PF3D7_1148100", "Pf3D7_11_v3_1920267_1920811_PF3D7_1148200", "Pf3D7_11_v3_1922655_1922928_PF3D7_1148500")) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description))%>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description))%>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all11_strainStarts = all11 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
all11 = all11 %>%
left_join(all11_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
chromLengths_11 = chromLengths %>%
filter(chrom %in% all11_strainStarts$chrom) %>%
left_join(all11_strainStarts) %>%
mutate(globalStart = 0,
globalEnd = length - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all11 = all11 %>%
mutate(chrom = factor(chrom, levels = chromLengths_11$chrom)) %>%
mutate(globalStart = ifelse("PfKH01_00_27" == chrom, globalStart + 17822, globalStart))%>%
mutate(globalEnd = ifelse("PfKH01_00_27" == chrom, globalEnd + 17822, globalEnd))
# %>%
# mutate(globalStart = ifelse("PfIT_00_10" == chrom, globalStart + 28623, globalStart))%>%
# mutate(globalEnd = ifelse("PfIT_00_10" == chrom, globalEnd + 28623 , globalEnd))
chromLengths_11 = chromLengths_11 %>%
mutate(globalStart = ifelse("PfKH01_00_27" == chrom, globalStart + 17822, globalStart))%>%
mutate(globalEnd = ifelse("PfKH01_00_27" == chrom, globalEnd + 17822, globalEnd))
# %>%
# mutate(globalStart = ifelse("PfIT_00_10" == chrom, globalStart + 28623, globalStart))%>%
# mutate(globalEnd = ifelse("PfIT_00_10" == chrom, globalEnd + 28623, globalEnd))
```
# Plotting
```{r}
descriptionColorsNames = unique(c(all08$description, all13$description, all11$description))
rawDescriptionColors = scheme$hex(length(descriptionColorsNames))
names(rawDescriptionColors) = descriptionColorsNames
genomicElementsColors = c()
genomicElementsColors["TARE1"] = "#F748A5"
genomicElementsColors["SB3"] = "#F0E442"
rawGeneAnnotationsColors = rawDescriptionColors
descriptionColors = c(rawDescriptionColors, genomicElementsColors)
geneAnnotationsColors = descriptionColors
```
## 8
```{r}
allRepeats_filt_starts_ends_tares_chrom08 = allRepeats_filt_starts_ends_tares %>%
filter(grepl("_08", X1)) %>%
left_join(all08_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart) %>%
filter(globalStart > 0)
allRepeats_filt_starts_ends_tares_chrom08 = allRepeats_filt_starts_ends_tares_chrom08 %>%
mutate(chrom = factor(chrom, levels = chromLengths_08$chrom))
chrom08_plot = ggplot() +
geom_rect(data = chromLengths_08, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all08, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
geom_rect(data = allRepeats_filt_starts_ends_tares_chrom08, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.2,
ymax = as.numeric(chrom) + 0.2,
fill = repeatType,
color = repeatType, strain = strain)
# , color = "black"
) +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_08$chrom)), labels = as.character(chromLengths_08$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all08$description, allRepeats_filt_starts_ends_tares_chrom08$repeatType)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all08$description, allRepeats_filt_starts_ends_tares_chrom08$repeatType)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all08$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
```
```{r}
#cairo_pdf("chrom08_plot.pdf", height = 12, width = 20)
pdf("chrom08_plot.pdf", height = 20, width = 25, useDingbats = F)
print(chrom08_plot +
scale_fill_manual(
"Genes\nDescription",
values = geneAnnotationsColors,
breaks = names(current_geneAnnotationsColors),
labels = names(current_geneAnnotationsColors),
guide = guide_legend(
override.aes = list(
color = current_geneAnnotationsColors,
fill = current_geneAnnotationsColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 7,
order = 1
)
) +
scale_color_manual(
"Genomic\nElements",
values = alpha(geneAnnotationsColors, 0),
# values = c("black", "black"),
breaks = names(genomicElementsColors),
labels = names(genomicElementsColors),
guide = guide_legend(
override.aes = list(
color = genomicElementsColors,
fill = genomicElementsColors,
alpha = 1,
shape = 22,
size = 5
),
ncol = 1,
order = 2
)
) +
theme(legend.text = element_text(size = 30), axis.text.x = element_text(size = 30, color = "black"), axis.text.y = element_text(size = 30, color = "black"),
legend.title = element_text(size = 30, face = "bold"),
legend.box="vertical", legend.margin=margin(),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
)
dev.off()
```
```{r}
#| echo: false
#| results: asis
cat(createDownloadLink("chrom08_plot.pdf"))
```
```{r}
#| column: screen-inset
plotly::ggplotly(chrom08_plot)
```
## 11
```{r}
allRepeats_filt_starts_ends_tares_chrom11 = allRepeats_filt_starts_ends_tares %>%
filter(grepl("_11", X1)) %>%
left_join(all11_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart) %>%
filter(globalStart > 0)
allRepeats_filt_starts_ends_tares_chrom11 = allRepeats_filt_starts_ends_tares_chrom11 %>%
mutate(chrom = factor(chrom, levels = chromLengths_11$chrom))
chrom11_plot = ggplot() +
geom_rect(data = chromLengths_11, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all11, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
geom_rect(data = allRepeats_filt_starts_ends_tares_chrom11, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.2,
ymax = as.numeric(chrom) + 0.2,
fill = repeatType,
color = repeatType, strain = strain)
# , color = "black"
) +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_11$chrom)), labels = as.character(chromLengths_11$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all11$description, allRepeats_filt_starts_ends_tares_chrom11$repeatType)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all11$description, allRepeats_filt_starts_ends_tares_chrom11$repeatType)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all11$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
```
```{r}
#cairo_pdf("chrom11_plot.pdf", height = 12, width = 20)
pdf("chrom11_plot.pdf", height = 20, width = 25, useDingbats = F)
print(chrom11_plot +
scale_fill_manual(
"Genes\nDescription",
values = geneAnnotationsColors,
breaks = names(current_geneAnnotationsColors),
labels = names(current_geneAnnotationsColors),
guide = guide_legend(
override.aes = list(
color = current_geneAnnotationsColors,
fill = current_geneAnnotationsColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 7,
order = 1
)
) +
scale_color_manual(
"Genomic\nElements",
values = alpha(geneAnnotationsColors, 0),
# values = c("black", "black"),
breaks = names(genomicElementsColors),
labels = names(genomicElementsColors),
guide = guide_legend(
override.aes = list(
color = genomicElementsColors,
fill = genomicElementsColors,
alpha = 1,
shape = 22,
size = 5
),
ncol = 1,
order = 2
)
) +
theme(legend.text = element_text(size = 30), axis.text.x = element_text(size = 30, color = "black"), axis.text.y = element_text(size = 30, color = "black"),
legend.title = element_text(size = 30, face = "bold"),
legend.box="vertical", legend.margin=margin(),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
)
dev.off()
```
```{r}
#| echo: false
#| results: asis
cat(createDownloadLink("chrom11_plot.pdf"))
```
```{r}
#| column: screen-inset
plotly::ggplotly(chrom11_plot)
```
## 13
```{r}
allRepeats_filt_starts_ends_tares_chrom13 = allRepeats_filt_starts_ends_tares %>%
filter(grepl("_13", X1)) %>%
left_join(all13_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart) %>%
filter(globalStart > 0)
allRepeats_filt_starts_ends_tares_chrom13 = allRepeats_filt_starts_ends_tares_chrom13 %>%
mutate(chrom = factor(chrom, levels = chromLengths_13$chrom))
chrom13_plot = ggplot() +
geom_rect(data = chromLengths_13, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all13, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
geom_rect(data = allRepeats_filt_starts_ends_tares_chrom13, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.2,
ymax = as.numeric(chrom) + 0.2,
fill = repeatType,
color = repeatType, strain = strain)
# , color = "black"
) +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_13$chrom)), labels = as.character(chromLengths_13$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all13$description, allRepeats_filt_starts_ends_tares_chrom13$repeatType)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all13$description, allRepeats_filt_starts_ends_tares_chrom13$repeatType)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all13$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
```
```{r}
#cairo_pdf("chrom13_plot.pdf", height = 12, width = 20)
pdf("chrom13_plot.pdf", height = 20, width = 25, useDingbats = F)
print(chrom13_plot +
scale_fill_manual(
"Genes\nDescription",
values = geneAnnotationsColors,
breaks = names(current_geneAnnotationsColors),
labels = names(current_geneAnnotationsColors),
guide = guide_legend(
override.aes = list(
color = current_geneAnnotationsColors,
fill = current_geneAnnotationsColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 9,
order = 1
)
) +
scale_color_manual(
"Genomic\nElements",
values = alpha(geneAnnotationsColors, 0),
# values = c("black", "black"),
breaks = names(genomicElementsColors),
labels = names(genomicElementsColors),
guide = guide_legend(
override.aes = list(
color = genomicElementsColors,
fill = genomicElementsColors,
alpha = 1,
shape = 22,
size = 5
),
ncol = 1,
order = 2
)
) +
theme(legend.text = element_text(size = 30), axis.text.x = element_text(size = 30, color = "black"), axis.text.y = element_text(size = 30, color = "black"),
legend.title = element_text(size = 30, face = "bold"),
legend.box="vertical", legend.margin=margin(),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
)
dev.off()
```
```{r}
#| echo: false
#| results: asis
cat(createDownloadLink("chrom13_plot.pdf"))
```
```{r}
#| column: screen-inset
plotly::ggplotly(chrom13_plot)
```
## Determining core gene regions
Determining how much of the core genome are involved in the re-arrangement between chromosomes 11 and 13, using only strains that do not have evidence of these re-arrangements
```{r}
selectStrains = c("Pf3D7","Pf7G8","PfCD01","PfDd2","PfGA01","PfGB4","PfGN01","PfKE01","PfKH02","PfSN01")
```
### Chromosome 11 core region
```{r}
allRepeats_filt_starts_ends_tares_chrom11_sel_tare1 = allRepeats_filt_starts_ends_tares_chrom11 %>%
filter(strain %in% selectStrains) %>%
filter(tare1)
chromLengths_11_sel = chromLengths_11 %>%
filter(strain %in% selectStrains)
all11_sel = all11 %>%
filter(strain %in% selectStrains)
all11_sel_rRNA = all11_sel %>%
filter(description %in% c("rRNA")) %>%
group_by(strain) %>%
slice_max(end)
all11_sel_DnaJ = all11_sel %>%
filter(grepl("DnaJ", description ))
chromLengths_11_sel_full = chromLengths_11_sel %>%
select(-minStart,-globalStart,-globalEnd) %>%
filter(strain %in% allRepeats_filt_starts_ends_tares_chrom11_sel_tare1$strain)
chromLengths_11_sel_full = chromLengths_11_sel_full %>%
left_join(all11_sel_rRNA %>%
select(strain, chrom, end) %>%
rename(duplicatedRegionEnd = end)) %>%
left_join(all11_sel_DnaJ%>%
select(strain, chrom, end) %>%
rename(coreRegionEnd = end))
chromLengths_11_sel_full = chromLengths_11_sel_full %>%
mutate(coreRegionDuplicated = coreRegionEnd - duplicatedRegionEnd,
additionalParalogousRegion = length - coreRegionEnd)
create_dt(chromLengths_11_sel_full)
skimr::skim(chromLengths_11_sel_full)
```
### Chromosome 13 core region
```{r}
allRepeats_filt_starts_ends_tares_chrom13_sel_tare1 = allRepeats_filt_starts_ends_tares_chrom13 %>%
filter(strain %in% selectStrains) %>%
filter(tare1)
chromLengths_13_sel = chromLengths_13 %>%
filter(strain %in% selectStrains)
all13_sel = all13 %>%
filter(strain %in% selectStrains)
all13_sel_rRNA = all13_sel %>%
filter(description %in% c("rRNA")) %>%
group_by(strain) %>%
slice_max(end)
all13_sel_acylCoA = all13_sel %>%
filter(grepl("acyl-CoA", description ))
chromLengths_13_sel_full = chromLengths_13_sel %>%
select(-minStart,-globalStart,-globalEnd) %>%
filter(strain %in% allRepeats_filt_starts_ends_tares_chrom13_sel_tare1$strain)
chromLengths_13_sel_full = chromLengths_13_sel_full %>%
left_join(all13_sel_rRNA %>%
select(strain, chrom, end) %>%
rename(duplicatedRegionEnd = end)) %>%
left_join(all13_sel_acylCoA%>%
select(strain, chrom, end) %>%
rename(coreRegionEnd = end))
chromLengths_13_sel_full = chromLengths_13_sel_full %>%
mutate(coreRegionDeleted = coreRegionEnd - duplicatedRegionEnd,
additionalParalogousRegion = length - coreRegionEnd)
create_dt(chromLengths_13_sel_full)
skimr::skim(chromLengths_13_sel_full)
```
### Combining
```{r}
all11_rRNAEnd_3D7 = all11 %>%
filter(col.3 == "Pf3D7_11_v3_1928933_1933138_PF3D7_1148640")
all11_DNAJ = all11 %>%
filter(col.3 == "Pf3D7_11_v3_2001067_2003313_PF3D7_1149600")
chrom11CoreLen = all11_DNAJ$end - all11_rRNAEnd_3D7$end[1]
all13_rRNA = all13 %>%
filter(description %in% c("rRNA"))
all13_rRNAEnd_3D7 = all13 %>%
filter(col.3 == "Pf3D7_13_v3_2802944_2807159_PF3D7_1371300")
all13_acylCoA = all13 %>%
filter(col.3 == "Pf3D7_13_v3_2850890_2853482_PF3D7_1372400")
chrom13CoreLen = all13_acylCoA$end - all13_rRNAEnd_3D7$end[1]
create_dt(tibble(
genomeCore = c("chrom11CoreLen", "chrom13CoreLen"),
lens = c(chrom11CoreLen, chrom13CoreLen)
))
```
# pfmdr1 region
Getting the region surrounding the duplicate region of pfmdr1 on chromosome 5 associated with chr13/pfhrp3 deletion
```{bash, eval = F}
mkdir -p surroundingRegionsMaterials/pfmdr1_region
```
```{r}
pfmdr1_windows = readr::read_tsv("../windowAnalysis/windows/stableWindows_05_regionOfInterestNarrow.bed", col_names = F)
pfmdr1_windows_largeWindow = pfmdr1_windows %>%
group_by(X1) %>%
summarise(X2 = min(X2),
X3 = max(X3)) %>%
mutate(name = paste0(X1, "-", X2, "-", X3)) %>%
mutate(len = X3 - X2,
strand = "+")
write_tsv(pfmdr1_windows_largeWindow, col_names = F,file = "surroundingRegionsMaterials/pfmdr1_region/pfmdr1_windows_largeWindow.bed")
pfmdr1_windows_geneIds = pfmdr1_windows %>%
mutate(geneID = gsub("[ID=", "", X7, fixed = T)) %>%
mutate(geneID = gsub(";.*", "", geneID)) %>%
select(geneID) %>%
filter(!is.na(geneID)) %>%
arrange() %>%
unique()
write_tsv(pfmdr1_windows[1,], col_names = F, file = "surroundingRegionsMaterials/pfmdr1_region/firstRegion.bed")
cat(pfmdr1_windows_geneIds$geneID, sep = "\n", file = "surroundingRegionsMaterials/pfmdr1_region/geneIDs.txt")
```
```{bash, eval = F}
cd surroundingRegionsMaterials/pfmdr1_region
elucidator gffRecordIDToGeneInfo --id geneIDs.txt --dout pfmdr1_region_geneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir
elucidator extractRefSeqsFromGenomes --bed firstRegion.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir firstRegion_pfmdr1_region --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
elucidator extractRefSeqsFromGenomes --bed pfmdr1_region_geneInfos/out_PF3D7_0523700.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir lastGene_pfmdr1_region --overWriteDirs --numThreads 7 --extendAndTrim --coverage 95 --keepBestOnly --identity 90
mkdir regionBeds
```
```{r}
strains = readr::read_tsv("surroundingRegionsMaterials/strains.txt", col_names = c("strains"))
all_region_around_mdr1 = tibble()
for(strain in strains$strains){
firstRegion = readr::read_tsv(paste0("surroundingRegionsMaterials/pfmdr1_region/firstRegion_pfmdr1_region/Pf3D7_05_v3-929384-929984-for/beds/", strain, "_bestRegion.bed"), col_names = F)
lastRegion = readr::read_tsv(paste0("surroundingRegionsMaterials/pfmdr1_region/lastGene_pfmdr1_region/Pf3D7_05_v3-980753-985354-rev/beds/", strain, "_bestRegion.bed"), col_names = F)
region_around_mdr1 = tibble(
X1 = firstRegion$X1[1],
X2 = firstRegion$X2[1],
X3 = lastRegion$X3[1]
) %>%
mutate(name = paste0(X1, "-", X2, "-", X3),
len = X3 - X2,
strand = "+")
all_region_around_mdr1 = bind_rows(
all_region_around_mdr1,
region_around_mdr1 %>%
mutate(strain = strain)
)
write_tsv(region_around_mdr1, col_names = F, file = paste0("surroundingRegionsMaterials/pfmdr1_region/regionBeds/", strain, "_pfmdr1Region.bed"))
}
```
```{bash, eval = F}
cd surroundingRegionsMaterials/pfmdr1_region/regionBeds
for x in *pfmdr1Region.bed; do elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_genes.bed; done;
for x in *pfmdr1Region.bed; do elucidator gffToBedByBedLoc --extraAttributes description --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_all.bed; done;
for x in *pfmdr1Region_genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done;
```
## Reading In Genes
### Chr5
```{r}
all05 = tibble()
for(strain in strains$strains){
strain05 = readr::read_tsv(paste0("surroundingRegionsMaterials/pfmdr1_region/regionBeds/split_", strain, "_pfmdr1Region_genes.tab.txt")) %>%
mutate(strain = strain ) %>%
mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>%
mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>%
rename(chrom = col.0,
start = col.1,
end = col.2)
all05 = bind_rows(all05, strain05)
}
all05 = all05 %>%
mutate(description = gsub("unknownfunction", "unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>%
mutate(description = gsub(",putative", ", putative", description)) %>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description))%>%
mutate(description = gsub("unknownfunction", "unknown function", description))%>%
mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>%
mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>%
mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>%
mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>%
mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>%
mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>%
mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>%
mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>%
mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>%
mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("PIR protein", "stevor", description)) %>%
mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>%
mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>%
mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>%
mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>%
mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>%
mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>%
mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>%
mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>%
mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>%
mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>%
mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>%
mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>%
mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>%
mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>%
mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>%
mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>%
mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>%
mutate(description = gsub("iron-sulfur assembly protein, putative", "iron-sulfur assembly protein", description)) %>%
mutate(description = gsub("mitochondrial processing peptidase alphasubunit, putative", "mitochondrial-processing peptidase subunit alpha, putative", description)) %>%
mutate(description = gsub(", putative", "", description)) %>%
mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>%
mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>%
mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description))
all05_strainStarts = all05 %>%
group_by(strain, chrom) %>%
summarise(minStart = min(start))
chromLengths_05 = all_region_around_mdr1 %>%
select(X1, X3, strain) %>%
rename(chrom = X1,
end = X3) %>%
left_join(all05_strainStarts)%>%
mutate(globalStart = 0,
globalEnd = end - minStart) %>%
mutate(chrom = factor(chrom, levels = chrom))
all05 = all05 %>%
left_join(all05_strainStarts) %>%
mutate(globalStart = start - minStart,
globalEnd = end - minStart)
all05 = all05 %>%
mutate(chrom = factor(chrom, levels = chromLengths_05$chrom))
```
## Plotting pfmdr1 regions
```{r}
descriptionColorsNames = unique(c(all05$description))
rawDescriptionColors = scheme$hex(length(descriptionColorsNames))
names(rawDescriptionColors) = descriptionColorsNames
rawGeneAnnotationsColors = rawDescriptionColors
descriptionColors = c(rawDescriptionColors)
```
```{r}
chrom05_plot = ggplot() +
geom_rect(data = chromLengths_05, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4), fill= "grey91") +
geom_rect(data = all05, aes(xmin = globalStart, xmax = globalEnd,
ymin = as.numeric(chrom) - 0.4,
ymax = as.numeric(chrom) + 0.4,
fill = description, ID = ID, strain = strain), color = "black") +
scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_05$chrom)), labels = as.character(chromLengths_05$chrom)) +
scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all05$description)]) +
scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all05$description)]) +
sofonias_theme_xRotate
current_geneAnnotationsColors = rawGeneAnnotationsColors[names(rawGeneAnnotationsColors) %in% all05$description]
current_geneAnnotationsColors = sort(current_geneAnnotationsColors)
```
```{r}
#cairo_pdf("chrom05_plot.pdf", height = 12, width = 20)
pdf("chrom05_plot.pdf", height = 12, width = 20, useDingbats = F)
print(chrom05_plot +
scale_fill_manual(
"Genes\nDescription",
values = descriptionColors,
breaks = names(descriptionColors),
labels = names(descriptionColors),
guide = guide_legend(
override.aes = list(
color = descriptionColors,
fill = descriptionColors,
alpha = 1,
shape = 22,
size = 5
),
nrow = 3,
order = 1
)
)
)
dev.off()
```
```{r}
#| echo: false
#| results: asis
cat(createDownloadLink("chrom05_plot.pdf"))
```
```{r}
#| column: screen-inset
plotly::ggplotly(chrom05_plot)
```
<!-- # Investigating other rRNA loci -->
<!-- The re-arrangement event around 11 and 13 which results in the deletion of 13 segment that includes pfhrp3 and subsequent duplication of chr11 appears to be associated with the rRNA loci between chr 11 and chr 13. The only other 2 intact rRNA loci (18S-5.8S-28S) is found on chromosome 5 and chromosome 7, will plot out these areas as well for investigation. -->
<!-- ```{bash, eval = F} -->
<!-- mkdir -p MappingOutSurroundingRegions/surroundingRegionsMaterials/chr5_chr7_rRNA -->
<!-- cd MappingOutSurroundingRegions/surroundingRegionsMaterials/chr5_chr7_rRNA -->
<!-- elucidator createBedRegionFromName --names Pf3D7_05_v3-1289161-1296225-for,Pf3D7_07_v3-1083318-1090383-for > chr5_chr7_rRNA_dup_regions.bed -->
<!-- elucidator extendBedRegions --left 50000 --bed chr5_chr7_rRNA_dup_regions.bed --out chr5_chr7_rRNA_dup_regions_upstream_50kb.bed --overWrite -->
<!-- elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed chr5_chr7_rRNA_dup_regions_upstream_50kb.bed --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --overWrite --out chr5_chr7_rRNA_dup_regions_upstream_50kb_genes.bed -->
<!-- elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file chr5_chr7_rRNA_dup_regions_upstream_50kb_genes.bed --out split_chr5_chr7_rRNA_dup_regions_upstream_50kb_genes.tab.txt -->
<!-- ``` -->
<!-- ```{bash, eval = F} -->
<!-- elucidator gffRecordIDToGeneInfo --id PF3D7_0530600,PF3D7_0724800 --dout chr5_chr7_rRNA_upStreamGeneInfos --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/Pf3D7.gff --2bit /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf3D7.2bit --overWriteDir -->
<!-- elucidator extractRefSeqsFromGenomes --bed chr5_chr7_rRNA_upStreamGeneInfos/out_PF3D7_0530600.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_0530600 --overWriteDirs --numThreads 7 --extendAndTrim -->
<!-- elucidator extractRefSeqsFromGenomes --bed chr5_chr7_rRNA_upStreamGeneInfos/out_PF3D7_0724800.1.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPF3D7_0724800 --overWriteDirs --numThreads 7 --extendAndTrim -->
<!-- elucidator createBedRegionFromName --names Pf3D7_05_v3-1238632-1239632 > Pf3D7_05_v3-1238632-1239632.bed -->
<!-- elucidator extractRefSeqsFromGenomes --bed Pf3D7_05_v3-1238632-1239632.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPf3D7_05_v3-1238632-1239632 --overWriteDirs --numThreads 7 --extendAndTrim -->
<!-- elucidator createBedRegionFromName --names Pf3D7_07_v3-1034968-1035968 > Pf3D7_07_v3-1034968-1035968.bed -->
<!-- elucidator extractRefSeqsFromGenomes --bed Pf3D7_07_v3-1034968-1035968.bed --genomeDir /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/ --primaryGenome Pf3D7 --outputDir extractedPf3D7_07_v3-1034968-1035968 --overWriteDirs --numThreads 7 --extendAndTrim -->
<!-- ``` -->
<!-- Extend from the genes of interest to the ends of the chromosomes -->
<!-- ```{bash, eval = F} -->
<!-- mkdir endBeds -->
<!-- for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPf3D7_05_v3-1238632-1239632/Pf3D7_05_v3-1238632-1239632-for/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable ../chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom05_toEnd.bed --overWrite ; done; -->
<!-- for x in /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/genomes/Pf*.fasta; do elucidator extendToEndOfChrom --bed extractedPf3D7_07_v3-1034968-1035968/Pf3D7_07_v3-1034968-1035968-for/beds/$(basename ${x%%.fasta})_region.bed --chromLengthsTable ../chromLengths/$(basename ${x%%.fasta}).txt | cut -f 1-3 | elucidator bed3ToBed6 --bed STDIN --out endBeds/$(basename ${x%%.fasta})_chrom07_toEnd.bed --overWrite ; done; -->
<!-- cd endBeds -->
<!-- for x in *toEnd.bed; do elucidator gffToBedByBedLoc --extraAttributes description --feature gene,pseudogene --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_genes.bed; done; -->
<!-- for x in *toEnd.bed; do elucidator gffToBedByBedLoc --extraAttributes description --bed ${x} --gff /tank/data/genomes/plasmodium/genomes/pf_plusPfSD01/info/gff/$(echo $x | sed 's/_.*//g').gff --overWrite --out ${x%%.bed}_all.bed; done; -->
<!-- for x in *toEnd_genes.bed; do elucidator splitColumnContainingMeta --addHeader --delim tab --overWrite --removeEmptyColumn --column col.6 --file ${x} --out split_${x%%.bed}.tab.txt ; done; -->
<!-- ``` -->
<!-- ## Reading In Genes -->
<!-- ## Read in chromosome lengths -->
<!-- ```{r} -->
<!-- strains = readr::read_tsv("surroundingRegionsMaterials/strains.txt", col_names = c("strains")) -->
<!-- chromLengths = tibble() -->
<!-- for(strain in strains$strains){ -->
<!-- chromLensFnp = paste0("surroundingRegionsMaterials/chromLengths/", strain, ".txt") -->
<!-- strainLens = readr::read_tsv(chromLensFnp, col_names = c("chrom", "length")) %>% -->
<!-- mutate(strain = strain) -->
<!-- chromLengths = bind_rows(chromLengths, strainLens) -->
<!-- } -->
<!-- ``` -->
<!-- ### chr 05 -->
<!-- ```{r} -->
<!-- all05 = tibble() -->
<!-- for(strain in strains$strains){ -->
<!-- strain05 = readr::read_tsv(paste0("surroundingRegionsMaterials/chr5_chr7_rRNA/endBeds/split_", strain, "_chrom05_toEnd_genes.tab.txt")) %>% -->
<!-- mutate(strain = strain ) %>% -->
<!-- mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>% -->
<!-- mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>% -->
<!-- rename(chrom = col.0, -->
<!-- start = col.1, -->
<!-- end = col.2) -->
<!-- all05 = bind_rows(all05, strain05) -->
<!-- } -->
<!-- all05 = all05 %>% -->
<!-- mutate(description = gsub("unknownfunction", "unknown function", description)) %>% -->
<!-- mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>% -->
<!-- mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>% -->
<!-- mutate(description = gsub(",putative", ", putative", description)) %>% -->
<!-- mutate(description = gsub(",pseudogene", ", pseudogene", description))%>% -->
<!-- mutate(description = gsub("unknownfunction", "unknown function", description))%>% -->
<!-- mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>% -->
<!-- mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>% -->
<!-- mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>% -->
<!-- mutate(description = gsub(",pseudogene", ", pseudogene", description))%>% -->
<!-- mutate(description = gsub("unknownfunction", "unknown function", description))%>% -->
<!-- mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>% -->
<!-- mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>% -->
<!-- mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>% -->
<!-- mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>% -->
<!-- mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>% -->
<!-- mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>% -->
<!-- mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>% -->
<!-- mutate(description = gsub("PIR protein", "stevor", description)) %>% -->
<!-- mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>% -->
<!-- mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>% -->
<!-- mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>% -->
<!-- mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>% -->
<!-- mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>% -->
<!-- mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>% -->
<!-- mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>% -->
<!-- mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>% -->
<!-- mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>% -->
<!-- mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>% -->
<!-- mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>% -->
<!-- mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>% -->
<!-- mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>% -->
<!-- mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>% -->
<!-- mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>% -->
<!-- mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>% -->
<!-- mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>% -->
<!-- mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description)) %>% -->
<!-- mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>% -->
<!-- mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>% -->
<!-- mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>% -->
<!-- mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>% -->
<!-- mutate(description = gsub(", putative", "", description)) %>% -->
<!-- mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>% -->
<!-- mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description)) %>% -->
<!-- mutate(description = ifelse(grepl("dolichol-phosphate mannosyltransferase subunit3", description, fixed = T), "dolichol-phosphate mannosyltransferase subunit 3", description)) %>% -->
<!-- mutate(description = ifelse(grepl("SURF1 domain-containing protein", description, fixed = T), "hypothetical protein, conserved", description)) %>% -->
<!-- mutate(description = ifelse(grepl("ribosomal protein S16, mitochondrial", description, fixed = T), "mitochondrial ribosomal protein S16 precursor", description)) %>% -->
<!-- filter(col.3 %!in% c("Pf3D7_05_v3_1288558_1289308_PF3D7_0531500", "Pf3D7_05_v3_1291692_1292051_PF3D7_0531700", "Pf3D7_05_v3_1292210_1292409_PF3D7_0531900")) -->
<!-- all05_strainStarts = all05 %>% -->
<!-- group_by(strain, chrom) %>% -->
<!-- summarise(minStart = min(start)) -->
<!-- chromLengths_05 = chromLengths %>% -->
<!-- filter(chrom %in% all05_strainStarts$chrom) %>% -->
<!-- left_join(all05_strainStarts)%>% -->
<!-- mutate(globalStart = 0, -->
<!-- globalEnd = length - minStart) %>% -->
<!-- mutate(chrom = factor(chrom, levels = chrom)) -->
<!-- all05 = all05 %>% -->
<!-- left_join(all05_strainStarts) %>% -->
<!-- mutate(globalStart = start - minStart, -->
<!-- globalEnd = end - minStart) -->
<!-- all05 = all05 %>% -->
<!-- mutate(chrom = factor(chrom, levels = chromLengths_05$chrom)) -->
<!-- ``` -->
<!-- ### chr 07 -->
<!-- ```{r} -->
<!-- all07 = tibble() -->
<!-- for (strain in strains$strains) { -->
<!-- strain07 = readr::read_tsv( -->
<!-- paste0( -->
<!-- "surroundingRegionsMaterials/chr5_chr7_rRNA/endBeds/split_", -->
<!-- strain, -->
<!-- "_chrom07_toEnd_genes.tab.txt" -->
<!-- ) -->
<!-- ) %>% -->
<!-- mutate(strain = strain) %>% -->
<!-- mutate(chromGlobal = gsub(paste0(strain, "_"), "", col.0)) %>% -->
<!-- mutate(chromGlobal = gsub("_v3", "", chromGlobal)) %>% -->
<!-- rename(chrom = col.0, -->
<!-- start = col.1, -->
<!-- end = col.2) -->
<!-- all07 = bind_rows(all07, strain07) -->
<!-- } -->
<!-- all07 = all07 %>% -->
<!-- mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>% -->
<!-- mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated erythrocyte binding-like protein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated erythrocyte binding-likeprotein"==description, "merozoite adhesive erythrocytic binding protein", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description)) %>% -->
<!-- mutate(description = gsub(",putative", ", putative", description)) %>% -->
<!-- mutate(description = gsub(",pseudogene", ", pseudogene", description))%>% -->
<!-- mutate(description = gsub("unknownfunction", "unknown function", description))%>% -->
<!-- mutate(description = gsub("conserved protein, unknown function", "conserved Plasmodium protein, unknown function", description)) %>% -->
<!-- mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>% -->
<!-- mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse("membrane associated histidine-rich protein 1"==description, "membrane associated histidine-rich protein", description))%>% -->
<!-- mutate(description = gsub(",pseudogene", ", pseudogene", description))%>% -->
<!-- mutate(description = gsub("unknownfunction", "unknown function", description))%>% -->
<!-- mutate(description = gsub(" \\(SURFIN", "\\(SURFIN", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Plasmodium exported protein", description), "Plasmodium exported protein (PHIST)", description)) %>% -->
<!-- mutate(description = ifelse("erythrocyte membrane protein 1 (PfEMP1), exon 2"==description, "erythrocyte membrane protein 1 (PfEMP1), exon 2, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse(grepl("sporozoite and liver stage tryptophan-rich protein, putative", description), "tryptophan/threonine-rich antigen", description))%>% -->
<!-- mutate(description = ifelse(grepl("CRA domain-containing protein, putative", description), "conserved Plasmodium protein, unknown function", description))%>% -->
<!-- mutate(description = gsub(",pseudogene", ", pseudogene", description)) %>% -->
<!-- mutate(description = gsub("surfaceantigen", "surface antigen", description)) %>% -->
<!-- mutate(description = gsub("Tetratricopeptide repeat, putative", "tetratricopeptide repeat protein, putative", description)) %>% -->
<!-- mutate(description = gsub("transmembraneprotein", "transmembrane protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("PfEMP1", description) & grepl("pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>% -->
<!-- mutate(description = gsub("PIR protein", "stevor", description)) %>% -->
<!-- mutate(description = gsub("erythrocyte membrane protein 1-like", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description)) %>% -->
<!-- mutate(description = gsub("acidic terminal segments, variant surface antigen of PfEMP1, putative", "erythrocyte membrane protein 1 (PfEMP1), pseudogene", description))%>% -->
<!-- mutate(description = ifelse(grepl("CoA binding protein", description, ignore.case = T), "acyl-CoA binding protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("transfer RNA", description) | grepl("tRNA", description), "tRNA", description))%>% -->
<!-- mutate(description = ifelse(grepl("cytoadherence", description), "CLAG", description))%>% -->
<!-- mutate(description = ifelse(grepl("surface-associated interspersed protein", description), "SURFIN", description))%>% -->
<!-- mutate(description = ifelse(grepl("SURFIN", description), "SURFIN", description))%>% -->
<!-- mutate(description = ifelse(grepl("stevor-like", description), "stevor, pseudogene", description)) %>% -->
<!-- mutate(description = ifelse(grepl("exported protein family", description), "exported protein family", description)) %>% -->
<!-- mutate(description = ifelse(grepl("ribosomal RNA", description), "rRNA", description)) %>% -->
<!-- mutate(description = ifelse(grepl("serine/threonine protein kinase", description), "serine/threonine protein kinase, FIKK family", description)) %>% -->
<!-- mutate(description = ifelse(grepl("hypothetical protein", description), "hypothetical protein, conserved", description)) %>% -->
<!-- mutate(description = ifelse(grepl("conserved Plasmodium protein, unknown function", description), "hypothetical protein, conserved", description)) %>% -->
<!-- mutate(description = ifelse(grepl("Rifin/stevor family, putative", description), "stevor", description)) %>% -->
<!-- mutate(description = ifelse(grepl("stevor", description), "stevor", description)) %>% -->
<!-- mutate(description = ifelse(grepl("rifin", description), "rifin", description)) %>% -->
<!-- mutate(description = ifelse(grepl("erythrocyte membrane protein 1 (PfEMP1), pseudogene", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>% -->
<!-- mutate(description = ifelse(grepl("erythrocyte membrane protein 1", description), "erythrocyte membrane protein 1 (PfEMP1)", description)) %>% -->
<!-- mutate(description = ifelse(grepl("probably protein", description), "unspecified product", description)) %>% -->
<!-- mutate(description = ifelse(grepl("RESA", description), "RESA", description)) %>% -->
<!-- mutate(description = ifelse(grepl("ring-infected erythrocyte surface antigen", description), "ring-infected erythrocyte surface antigen", description))%>% -->
<!-- mutate(description = ifelse(grepl("Duffy binding domain/Erythrocyte binding antigen175, putative", description), "erythrocyte binding like protein 1", description))%>% -->
<!-- mutate(description = ifelse(grepl("erythrocyte binding like protein 1", description), "erythrocyte binding like protein 1", description)) %>% -->
<!-- mutate(description = ifelse(grepl("unspecified product", description), "hypothetical protein, conserved", description))%>% -->
<!-- mutate(description = ifelse(grepl("probable protein, unknown function", description), "hypothetical protein, conserved", description))%>% -->
<!-- mutate(description = ifelse(grepl("RESA", description), "ring-infected erythrocyte surface antigen", description)) %>% -->
<!-- mutate(description = gsub(", putative", "", description)) %>% -->
<!-- mutate(description = ifelse(grepl("transcription factor with AP2 domain(s)", description, fixed = T), "AP2 domain transcription factor AP2-L", description)) %>% -->
<!-- mutate(description = ifelse(grepl("zinc finger, C3HC4 type", description, fixed = T), "RING zinc finger protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 subunit alpha", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description)) %>% -->
<!-- mutate(description = ifelse(grepl("eukaryotic translation initiation factor 2 alphasubunit", description, fixed = T), "eukaryotic translation initiation factor 2 alpha subunit", description)) %>% -->
<!-- mutate(description = ifelse(grepl("mitochondrial import inner membrane translocasesubunit TIM50", description, fixed = T), "mitochondrial import inner membrane translocase subunit TIM50", description)) %>% -->
<!-- mutate(description = ifelse(grepl("ribosome biogenesis protein BRX1 homolog", description, fixed = T), "ribosome biogenesis protein BRX1", description)) %>% -->
<!-- mutate(description = ifelse(grepl("dolichyl-diphosphooligosaccharide--protein glycosyltransferase subunit OST2", description, fixed = T), "dolichyl-diphosphooligosaccharide--proteinglycosyltransferase subunit DAD1", description)) %>% -->
<!-- mutate(description = ifelse(grepl("kelch protein", description, fixed = T), "kelch domain-containing protein", description)) %>% -->
<!-- mutate(description = ifelse(grepl("mTERF domain-containing protein", description, fixed = T), "hypothetical protein, conserved", description)) %>% -->
<!-- mutate(description = ifelse(grepl("IMP1-like protein", description, fixed = T), "hypothetical protein, conserved", description)) %>% -->
<!-- filter(col.3 != "PfIT_07_1376876_1377089_PfIT_070036200")%>% -->
<!-- filter(col.3 %!in% c("Pf3D7_07_v3_1085847_1086212_PF3D7_0725700", "Pf3D7_07_v3_1086371_1086569_PF3D7_0725900")) -->
<!-- all07_strainStarts = all07 %>% -->
<!-- group_by(strain, chrom) %>% -->
<!-- summarise(minStart = min(start)) -->
<!-- all07 = all07 %>% -->
<!-- left_join(all07_strainStarts) %>% -->
<!-- mutate(globalStart = start - minStart, -->
<!-- globalEnd = end - minStart) -->
<!-- chromLengths_07 = chromLengths %>% -->
<!-- filter(chrom %in% all07_strainStarts$chrom) %>% -->
<!-- left_join(all07_strainStarts)%>% -->
<!-- mutate(globalStart = 0, -->
<!-- globalEnd = length - minStart) %>% -->
<!-- mutate(chrom = factor(chrom, levels = chrom)) -->
<!-- all07 = all07 %>% -->
<!-- mutate(chrom = factor(chrom, levels = chromLengths_07$chrom)) -->
<!-- ``` -->
<!-- ## Plotting -->
<!-- ```{r} -->
<!-- descriptionColorsNames = sort(unique(c(all05$description, all07$description))) -->
<!-- rawDescriptionColors = scheme$hex(length(descriptionColorsNames)) -->
<!-- names(rawDescriptionColors) = descriptionColorsNames -->
<!-- genomicElementsColors = c() -->
<!-- genomicElementsColors["TARE1"] = "#F748A5" -->
<!-- genomicElementsColors["SB3"] = "#F0E442" -->
<!-- rawGeneAnnotationsColors = rawDescriptionColors -->
<!-- descriptionColors = c(rawDescriptionColors, genomicElementsColors) -->
<!-- geneAnnotationsColors = descriptionColors -->
<!-- chr05_chr07_geneAnnotationColors = rawDescriptionColors -->
<!-- save(chr05_chr07_geneAnnotationColors, file = "chr05_chr07_geneAnnotationColors.Rdata") -->
<!-- ``` -->
<!-- ### 05 -->
<!-- ```{r} -->
<!-- allRepeats_filt_starts_ends_tares_chrom05 = allRepeats_filt_starts_ends_tares %>% -->
<!-- filter(grepl("_05", X1)) %>% -->
<!-- left_join(all05_strainStarts) %>% -->
<!-- mutate(globalStart = start - minStart, -->
<!-- globalEnd = end - minStart) %>% -->
<!-- filter(globalStart > 0) -->
<!-- allRepeats_filt_starts_ends_tares_chrom05 = allRepeats_filt_starts_ends_tares_chrom05 %>% -->
<!-- mutate(chrom = factor(chrom, levels = chromLengths_05$chrom)) -->
<!-- chrom05_plot = ggplot() + -->
<!-- geom_rect(data = chromLengths_05, aes(xmin = globalStart, xmax = globalEnd, -->
<!-- ymin = as.numeric(chrom) - 0.4, -->
<!-- ymax = as.numeric(chrom) + 0.4), fill= "grey91") + -->
<!-- geom_rect(data = all05, aes(xmin = globalStart, xmax = globalEnd, -->
<!-- ymin = as.numeric(chrom) - 0.4, -->
<!-- ymax = as.numeric(chrom) + 0.4, -->
<!-- fill = description, ID = ID, strain = strain), color = "black") + -->
<!-- geom_rect(data = allRepeats_filt_starts_ends_tares_chrom05, aes(xmin = globalStart, xmax = globalEnd, -->
<!-- ymin = as.numeric(chrom) - 0.2, -->
<!-- ymax = as.numeric(chrom) + 0.2, -->
<!-- fill = repeatType, -->
<!-- color = repeatType, strain = strain) -->
<!-- # , color = "black" -->
<!-- ) + -->
<!-- scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_05$chrom)), labels = as.character(chromLengths_05$chrom)) + -->
<!-- scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all05$description, allRepeats_filt_starts_ends_tares_chrom05$repeatType)]) + -->
<!-- scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all05$description, allRepeats_filt_starts_ends_tares_chrom05$repeatType)]) + -->
<!-- sofonias_theme_xRotate -->
<!-- current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all05$description] -->
<!-- current_geneAnnotationsColors = current_geneAnnotationsColors[sort(names(current_geneAnnotationsColors))] -->
<!-- ``` -->
<!-- ```{r} -->
<!-- #cairo_pdf("surroundingRegionsMaterials/chr5_chr7_rRNA/chrom05_plot.pdf", height = 12, width = 30) -->
<!-- pdf("surroundingRegionsMaterials/chr5_chr7_rRNA/chrom05_plot.pdf", height = 12, width = 30, useDingbats = F) -->
<!-- print(chrom05_plot + -->
<!-- scale_fill_manual( -->
<!-- "Genes\nDescription", -->
<!-- values = geneAnnotationsColors, -->
<!-- breaks = names(current_geneAnnotationsColors), -->
<!-- labels = names(current_geneAnnotationsColors), -->
<!-- guide = guide_legend( -->
<!-- override.aes = list( -->
<!-- color = current_geneAnnotationsColors, -->
<!-- fill = current_geneAnnotationsColors, -->
<!-- alpha = 1, -->
<!-- shape = 22, -->
<!-- size = 5 -->
<!-- ), -->
<!-- nrow = 3, -->
<!-- order = 1 -->
<!-- ) -->
<!-- ) + -->
<!-- scale_color_manual( -->
<!-- "Genomic\nElements", -->
<!-- values = alpha(geneAnnotationsColors, 0), -->
<!-- # values = c("black", "black"), -->
<!-- breaks = names(genomicElementsColors), -->
<!-- labels = names(genomicElementsColors), -->
<!-- guide = guide_legend( -->
<!-- override.aes = list( -->
<!-- color = genomicElementsColors, -->
<!-- fill = genomicElementsColors, -->
<!-- alpha = 1, -->
<!-- shape = 22, -->
<!-- size = 5 -->
<!-- ), -->
<!-- ncol = 1, -->
<!-- order = 2 -->
<!-- ) -->
<!-- ) -->
<!-- ) -->
<!-- dev.off() -->
<!-- ``` -->
<!-- ```{r} -->
<!-- #| echo: false -->
<!-- #| results: asis -->
<!-- cat(createDownloadLink("surroundingRegionsMaterials/chr5_chr7_rRNA/chrom05_plot.pdf")) -->
<!-- ``` -->
<!-- ```{r} -->
<!-- #| column: screen-inset -->
<!-- plotly::ggplotly(chrom05_plot) -->
<!-- ``` -->
<!-- ### 07 -->
<!-- ```{r} -->
<!-- allRepeats_filt_starts_ends_tares_chrom07 = allRepeats_filt_starts_ends_tares %>% -->
<!-- filter(grepl("_07", X1)) %>% -->
<!-- left_join(all07_strainStarts) %>% -->
<!-- mutate(globalStart = start - minStart, -->
<!-- globalEnd = end - minStart) %>% -->
<!-- filter(globalStart > 0) -->
<!-- allRepeats_filt_starts_ends_tares_chrom07 = allRepeats_filt_starts_ends_tares_chrom07 %>% -->
<!-- mutate(chrom = factor(chrom, levels = chromLengths_07$chrom)) -->
<!-- chrom07_plot = ggplot() + -->
<!-- geom_rect(data = chromLengths_07, aes(xmin = globalStart, xmax = globalEnd, -->
<!-- ymin = as.numeric(chrom) - 0.4, -->
<!-- ymax = as.numeric(chrom) + 0.4), fill= "grey91") + -->
<!-- geom_rect(data = all07, aes(xmin = globalStart, xmax = globalEnd, -->
<!-- ymin = as.numeric(chrom) - 0.4, -->
<!-- ymax = as.numeric(chrom) + 0.4, -->
<!-- fill = description, ID = ID, strain = strain), color = "black") + -->
<!-- geom_rect(data = allRepeats_filt_starts_ends_tares_chrom07, aes(xmin = globalStart, xmax = globalEnd, -->
<!-- ymin = as.numeric(chrom) - 0.2, -->
<!-- ymax = as.numeric(chrom) + 0.2, -->
<!-- fill = repeatType, -->
<!-- color = repeatType, strain = strain) -->
<!-- # , color = "black" -->
<!-- ) + -->
<!-- scale_y_continuous(breaks = 1:max(as.numeric(chromLengths_07$chrom)), labels = as.character(chromLengths_07$chrom)) + -->
<!-- scale_fill_manual(values = descriptionColors[names(descriptionColors) %in% c(all07$description, allRepeats_filt_starts_ends_tares_chrom07$repeatType)]) + -->
<!-- scale_color_manual(values = descriptionColors[names(descriptionColors) %in% c(all07$description, allRepeats_filt_starts_ends_tares_chrom07$repeatType)]) + -->
<!-- sofonias_theme_xRotate -->
<!-- # + -->
<!-- # theme(axis.text.x = element_blank(), -->
<!-- # axis.ticks.x = element_blank()) -->
<!-- current_geneAnnotationsColors = geneAnnotationsColors[names(geneAnnotationsColors) %in% all07$description] -->
<!-- current_geneAnnotationsColors = current_geneAnnotationsColors[sort(names(current_geneAnnotationsColors))] -->
<!-- ``` -->
<!-- ```{r} -->
<!-- # cairo_pdf("surroundingRegionsMaterials/chr5_chr7_rRNA/chrom07_plot.pdf", height = 12, width = 35) -->
<!-- pdf("surroundingRegionsMaterials/chr5_chr7_rRNA/chrom07_plot.pdf", height = 12, width = 35, useDingbats = F) -->
<!-- print(chrom07_plot + -->
<!-- scale_fill_manual( -->
<!-- "Genes\nDescription", -->
<!-- values = geneAnnotationsColors, -->
<!-- breaks = names(current_geneAnnotationsColors), -->
<!-- labels = names(current_geneAnnotationsColors), -->
<!-- guide = guide_legend( -->
<!-- override.aes = list( -->
<!-- color = current_geneAnnotationsColors, -->
<!-- fill = current_geneAnnotationsColors, -->
<!-- alpha = 1, -->
<!-- shape = 22, -->
<!-- size = 5 -->
<!-- ), -->
<!-- nrow = 7, -->
<!-- order = 1 -->
<!-- ) -->
<!-- ) + -->
<!-- scale_color_manual( -->
<!-- "Genomic\nElements", -->
<!-- values = alpha(geneAnnotationsColors, 0), -->
<!-- # values = c("black", "black"), -->
<!-- breaks = names(genomicElementsColors), -->
<!-- labels = names(genomicElementsColors), -->
<!-- guide = guide_legend( -->
<!-- override.aes = list( -->
<!-- color = genomicElementsColors, -->
<!-- fill = genomicElementsColors, -->
<!-- alpha = 1, -->
<!-- shape = 22, -->
<!-- size = 5 -->
<!-- ), -->
<!-- ncol = 1, -->
<!-- order = 2 -->
<!-- ) -->
<!-- ) -->
<!-- ) -->
<!-- dev.off() -->
<!-- ``` -->
<!-- ```{r} -->
<!-- #| echo: false -->
<!-- #| results: asis -->
<!-- cat(createDownloadLink("surroundingRegionsMaterials/chr5_chr7_rRNA/chrom07_plot.pdf")) -->
<!-- ``` -->
<!-- ```{r} -->
<!-- #| column: screen-inset -->
<!-- plotly::ggplotly(chrom07_plot) -->
<!-- ``` -->
<!-- ```{r} -->
<!-- nonParalogousStableWindows = readr::read_tsv("../windowAnalysis/windows/mergedFinalLargeWindows.bed", -->
<!-- col_names = F) %>% -->
<!-- bind_rows( -->
<!-- readr::read_tsv( -->
<!-- "../windowAnalysis/fine_tuning_regions/chr5_chr7_rRNA/finalTunned_duplicatedRegionchr05chr07.bed", -->
<!-- col_names = F -->
<!-- ), -->
<!-- readr::read_tsv( -->
<!-- "../windowAnalysis/fine_tuning_regions/chr7_post_dupRegion/merged_passingInitialFilter_withGeneInfo.bed", -->
<!-- col_names = F -->
<!-- ) -->
<!-- ) %>% -->
<!-- arrange(X1, X2, X3) -->
<!-- chr5_chr7_rRNA_dup_regions_upstream_50kb = readr::read_tsv("surroundingRegionsMaterials/chr5_chr7_rRNA/chr5_chr7_rRNA_dup_regions_upstream_50kb_genes.bed", col_names = F) %>% -->
<!-- group_by(X1) %>% -->
<!-- summarise(minStart = min(X2)) -->
<!-- nonParalogousStableWindows_05_07_regionOfInterest = nonParalogousStableWindows %>% -->
<!-- filter(X1 %in% chr5_chr7_rRNA_dup_regions_upstream_50kb$X1) %>% -->
<!-- left_join(chr5_chr7_rRNA_dup_regions_upstream_50kb) %>% -->
<!-- filter(X2 > minStart) %>% -->
<!-- select(-minStart) %>% -->
<!-- filter("Pf3D7_07_v3-1265874-1278574" != X4) -->
<!-- # get rid of dynein heavy chain, putative, it's 12700 and most of the time will be invariable given the nature of that gene -->
<!-- write_tsv(nonParalogousStableWindows_05_07_regionOfInterest, "../windowAnalysis/windows/nonParalogousStableWindows_05_07_regionOfInterest.bed", col_names = F) -->
<!-- write_tsv(nonParalogousStableWindows_05_07_regionOfInterest %>% -->
<!-- filter(!(X1 == "Pf3D7_07_v3" & X2 >=1156595)), "../windowAnalysis/windows/nonParalogousStableWindows_05_07_regionOfInterestNarrow.bed", col_names = F) -->
<!-- ``` -->
<!-- ```{bash, eval = F} -->
<!-- nohup elucidator runMultipleCommands --cmdFile runRegionBamTrimCmds.txt --additionalFields "{NCPUS}:5;{SAMPLE}:selectControlSamples.txt;{DIRNAME}:nonParalogousStableWindows_05_07_regionOfInterest;{BEDFNP}:\"/Users/nick/Dropbox_Personal/ownCloud/documents/plasmodium/falciparum/hrps/windowSelectionSurroundingHRPs/Analysis_Surrounding_HRP2_3_deletions/windowAnalysis/windows/nonParalogousStableWindows_05_07_regionOfInterest.bed\"" --numThreads 3 --replaceFields & -->
<!-- ``` -->