Code
# Read the sample metadata table and split the comma-separated SRARuns
# column so that each run accession ends up on its own row.
metadata <- readr::read_tsv(
  "DeletionPatternAnalysis/Supplemental Table - sample metadata - field samples and Lab Isolates.tsv"
)
metadata_mod <- metadata %>%
  mutate(SRARuns = strsplit(SRARuns, split = ",")) %>%
  unnest(SRARuns)

# Apply the same one-accession-per-row expansion to the Pf7 sample sheet
# (ENA column) and keep the first three characters of each accession as
# `prefix`.
pf7_samples <- readr::read_tsv("meta/Pf7_samples.txt")
pf7_samples_mod <- pf7_samples %>%
  mutate(ENA = strsplit(ENA, ",")) %>%
  unnest(ENA) %>%
  mutate(prefix = substr(ENA, 1, 3))

# Runs used here whose accession is not listed in the Pf7 sample sheet.
# NOTE(review): `%!in%` is defined outside this chunk - presumably the
# negation of `%in%`; confirm.
metadata_mod_notPf7 <- metadata_mod %>%
  filter(SRARuns %!in% pf7_samples_mod$ENA)

# Write both accession lists, one accession per line.
cat(
  metadata_mod$SRARuns,
  file = "meta/allSRARunAccessionsUsed.txt",
  sep = "\n"
)
cat(
  metadata_mod_notPf7$SRARuns,
  file = "meta/allSRARunAccessionsUsedNotInPf7.txt",
  sep = "\n"
)
Code
# Write one metadata-only fastq-dl command per run accession; each command
# logs to its own <accession>_fastq-dl.log.txt. Redirecting the whole loop
# rebuilds getInfoCmds.txt from scratch on every run.
for x in $(cat allSRARunAccessionsUsed.txt); do
  echo "fastq-dl --only-download-metadata --prefix ${x} -a ${x} > ${x}_fastq-dl.log.txt 2>&1"
done > getInfoCmds.txt

conda activate fastq-dl
nohup elucidator runMultipleCommands --cmdFile getInfoCmds.txt --numThreads 1 --raw &
# The job above runs in the background; block until it finishes, because the
# steps below consume the *run-info.tsv files those commands are expected to
# leave in this directory.
wait "$!"

# Combine the per-run *-run-info.tsv tables into allRunInfos.tsv, then tally
# the study_accession column.
elucidator rBind --contains=-run-info.tsv --header --delim tab --fill --overWrite --out allRunInfos.tsv
elucidator countColumn --file allRunInfos.tsv --header --delim tab --columnName study_accession

# Archive the per-run tables. -p keeps a re-run from failing when the
# directory already exists, and the loose copies are removed only if the
# archive was written successfully.
mkdir -p allRunInfos
mv *run-info.tsv allRunInfos
tar -zcvf allRunInfos.tar.gz allRunInfos && rm -fr allRunInfos
Code
# Work in a dedicated sub-directory; -p tolerates a pre-existing directory
# and the cd is attempted only when the directory is available.
mkdir -p notinpf7 && cd notinpf7

# Write one metadata-only fastq-dl command per run accession listed in the
# parent directory's not-in-Pf7 list; each command logs to its own
# <accession>_fastq-dl.log.txt. Redirecting the whole loop rebuilds
# getInfoCmds.txt from scratch on every run.
for x in $(cat ../allSRARunAccessionsUsedNotInPf7.txt); do
  echo "fastq-dl --only-download-metadata --prefix ${x} -a ${x} > ${x}_fastq-dl.log.txt 2>&1"
done > getInfoCmds.txt

conda activate fastq-dl
nohup elucidator runMultipleCommands --cmdFile getInfoCmds.txt --numThreads 1 --raw &
# The job above runs in the background; block until it finishes, because the
# steps below consume the *run-info.tsv files those commands are expected to
# leave in this directory.
wait "$!"

# Combine the per-run *-run-info.tsv tables into allRunInfos.tsv, then tally
# the study_accession column.
elucidator rBind --contains=-run-info.tsv --header --delim tab --fill --overWrite --out allRunInfos.tsv
elucidator countColumn --file allRunInfos.tsv --header --delim tab --columnName study_accession

# Archive the per-run tables. -p keeps a re-run from failing when the
# directory already exists, and the loose copies are removed only if the
# archive was written successfully.
mkdir -p allRunInfos
mv *run-info.tsv allRunInfos
tar -zcvf allRunInfos.tar.gz allRunInfos && rm -fr allRunInfos
Code
# Read the combined run-info tables: one covering every run accession used,
# one covering only the runs that are not listed in Pf7.
allInfo <- readr::read_tsv("meta/forhrp23paper/allRunInfos.tsv")
infoSamplesNotP7 <- readr::read_tsv("meta/forhrp23paper/notinpf7/allRunInfos.tsv")

# Runs whose accession appears in the Pf7 sample sheet; their
# study_accession values define the set of studies already covered by Pf7.
allInfo_pf7 <- allInfo %>%
  filter(run_accession %in% pf7_samples_mod$ENA)

# Non-Pf7 runs that also belong to a study with no run in Pf7.
# NOTE(review): `%!in%` is defined outside this chunk - presumably the
# negation of `%in%`; confirm.
infoSamplesNotP7_projects_notInP7 <- infoSamplesNotP7 %>%
  filter(study_accession %!in% allInfo_pf7$study_accession)

# Number of runs per remaining study, smallest first. count(study_accession)
# yields the same rows and columns as group_by() + count() but returns an
# ungrouped tibble, so no grouping leaks into later steps.
infoSamplesNotP7_projects_notInP7_studiesCount <-
  infoSamplesNotP7_projects_notInP7 %>%
  count(study_accession) %>%
  arrange(n)

# Namespace-qualified to match the readr::read_tsv() calls in this document.
readr::write_tsv(
  infoSamplesNotP7_projects_notInP7_studiesCount,
  "meta/studies_not_pf7.txt"
)
All studies not found in Pf7 were then searched for on SRA, and the associated publications were identified where possible.
Code
# Load the per-study table that carries the publication annotations
# (file name suggests these were determined separately - confirm), then
# display it.
infoSamplesNotP7_projects_notInP7_studiesCount_withInfo <-
  readr::read_tsv("meta/studies_not_pf7_withDeterminedPubs.tsv")

# NOTE(review): create_dt() is defined outside this chunk - presumably it
# renders the data frame as a table in the report; confirm.
create_dt(infoSamplesNotP7_projects_notInP7_studiesCount_withInfo)