DENTIST: Test Data

This directory contains all the data and commands required to reproduce the results presented in the manuscript:

Arne Ludwig, Martin Pippel, Gene Myers, Michael Hiller. DENTIST – close assembly gaps with high confidence. In preparation.

Index of Files

Table of Contents

Naming Conventions

| Filename     | Meaning                                    |
|--------------|--------------------------------------------|
| dentist      | DENTIST                                    |
| pbjelly      | PBJelly                                    |
| finisher_sc  | FinisherSC                                 |
| lr_gapcloser | LR_gapcloser                               |
| arrow        | PacBio GenomicConsensus (quiver or arrow)  |

Directory Structure

./data

Ground-truth assemblies, test assemblies, and the matching read data (simulated and real). The data is grouped by ground-truth assembly, which generally differs between simulated and real reads (except for C. anna).

Structure of ./data/*/

List of subfolders

./results

Gap-closed assemblies and results of the automatic evaluation.

./source

Scripts and workflow files required to run the gap closing software and analysis.

Executing the Tools

DENTIST

# Run DENTIST on every comparison dataset and evaluate each result.
# The relative paths used below are resolved from the DENTIST workflow
# directory.
cd ./source/dentist

# Each entry is passed verbatim to snakemake_dentist.sh and check-result.sh.
# NOTE(review): entries appear to follow <ground-truth assembly>/<read set>.
COMPARISON_DATASETS=(
    d_melanogaster/simulated-pb  d_melanogaster_pacbio/real-pb
    a_thaliana/simulated-pb      a_thaliana_pacbio/real-pb
    c_anna/simulated-pb          c_anna/real-pb
    h_sapiens/simulated-pb
    h_sapiens_real/real-pb       h_sapiens_real/real-onp
)

# runs for comprehensive comparison
for dataset in "${COMPARISON_DATASETS[@]}"; do
    # Options forwarded after the dataset name.
    workflow_args=(-p --profile=slurm --restart-times=2)

    # SKIP_LACHECK=1 is placed in the environment of this command only.
    SKIP_LACHECK=1 ./snakemake_dentist.sh "$dataset" "${workflow_args[@]}"

    # Evaluate the result of the run that just finished.
    ../check-result.sh dentist "$dataset" all
done

# runs for scaffolding analysis
# Iterates over COMPARISON_DATASETS again, this time passing
# --base-config=scaffolding; no check-result.sh call follows these runs.
for dataset in "${COMPARISON_DATASETS[@]}"; do
    # Full argument list, in the order the wrapper script receives it.
    scaffolding_args=(
        --base-config=scaffolding
        "$dataset"
        -p --profile=slurm --restart-times=2
    )
    SKIP_LACHECK=1 ./snakemake_dentist.sh "${scaffolding_args[@]}"
done

# Build one dataset name per value listed below:
# d_melanogaster/simulated-pb-<value>x, in the listed order.
COVERAGE_DATASETS=()
for coverage in 5 6 7 8 9 10 12 14 16 18 20 25 30 40 50 60 70 80 90 100; do
    COVERAGE_DATASETS+=("d_melanogaster/simulated-pb-${coverage}x")
done

# runs for coverage analysis
for dataset in "${COVERAGE_DATASETS[@]}"; do
    # SKIP_LACHECK=1 is placed in the environment of this command only.
    SKIP_LACHECK=1 ./snakemake_dentist.sh "$dataset" -p --profile=slurm --restart-times=2

    # Evaluate the result of the run that just finished.
    ../check-result.sh dentist "$dataset" all
done

PBJelly

# Run the PBJelly pipeline on every comparison dataset and evaluate each
# result. The relative paths used below are resolved from the PBJelly
# directory.
cd ./source/pbjelly

# Each entry is passed verbatim to pipeline.sh and check-result.sh.
COMPARISON_DATASETS=(
    d_melanogaster  d_melanogaster_pacbio
    a_thaliana      a_thaliana_pacbio
    c_anna          c_anna_pacbio
    h_sapiens       h_sapiens_real
)

for dataset in "${COMPARISON_DATASETS[@]}"; do
    ./pipeline.sh "$dataset"

    # Evaluate the result of the run that just finished.
    ../check-result.sh pbjelly "$dataset" all
done

FinisherSC

# Run the FinisherSC Snakemake workflow on its comparison datasets and
# evaluate each result. The relative paths used below are resolved from the
# FinisherSC directory.
cd ./source/finisher_sc

# Only the two D. melanogaster datasets are listed for this tool.
COMPARISON_DATASETS=(d_melanogaster d_melanogaster_pacbio)

for dataset in "${COMPARISON_DATASETS[@]}"; do
    # One YAML config per dataset, looked up under config/.
    config_file="config/$dataset.yml"
    snakemake --configfile="$config_file" -p --profile=slurm --restart-times=2

    # Evaluate the result of the run that just finished.
    ../check-result.sh finisher_sc "$dataset" all
done

LR_gapcloser

# Run the LR_gapcloser Snakemake workflow on every comparison dataset and
# evaluate each result. The relative paths used below are resolved from the
# LR_gapcloser directory.
cd ./source/lr_gapcloser

# Each entry names a <dataset>.yml config and is passed to check-result.sh.
COMPARISON_DATASETS=(
    d_melanogaster  d_melanogaster_pacbio
    a_thaliana      a_thaliana_pacbio
    c_anna          c_anna_pacbio
    h_sapiens       h_sapiens_real
)

for dataset in "${COMPARISON_DATASETS[@]}"; do
    # NOTE(review): the config file is referenced directly in this directory,
    # without a config/ prefix -- confirm this matches the layout of
    # ./source/lr_gapcloser.
    config_file="$dataset.yml"
    snakemake --configfile="$config_file" -p --profile=slurm --restart-times=2

    # Evaluate the result of the run that just finished.
    ../check-result.sh lr_gapcloser "$dataset" all
done

PacBio GenomicConsensus

# Run the PacBio GenomicConsensus Snakemake workflow on its comparison
# datasets and evaluate each result. The relative paths used below are
# resolved from the arrow directory.
cd ./source/arrow

# Each entry names a config/<dataset>.yml file and is passed to
# check-result.sh.
COMPARISON_DATASETS=(
    d_melanogaster_pacbio  a_thaliana_pacbio
    c_anna_pacbio          h_sapiens_real
)

for dataset in "${COMPARISON_DATASETS[@]}"; do
    # One YAML config per dataset, looked up under config/.
    config_file="config/$dataset.yml"
    snakemake --configfile="$config_file" -p --profile=slurm --restart-times=2

    # Evaluate the result of the run that just finished.
    ../check-result.sh arrow "$dataset" all
done