Open in github.dev
Open in a new github.dev tab
Open in codespace
3 months agoMar 19, 2025
Expand file tree
/
Copy pathrun_napls_qc.sh
t
Latest commit
Open commit details
executable file
·410 lines (361 loc) · 16.4 KB
/
run_napls_qc.sh
File metadata and controls
executable file
·410 lines (361 loc) · 16.4 KB
Ask Copilot about this file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
#!/bin/bash
# =============================================================================
# NAPLS3 Genomic Data QC Pipeline - Master Orchestration Script
# Individual scripts in /gprep/processed_genotype can also be run sequentially
# =============================================================================
# Purpose: Coordinates the complete QC workflow for NAPLS3 genomic data through
# the ENIGMA-DTI pre-imputation QC pipeline (with SNP renaming).
#
# Workflow stages:
# 1. Create rsid binaries - One-time setup of dbSNP156 binary files for variant mapping
# 2. Rename SNPs - Convert variants to standard rsID naming
# 3. ENIGMA-DTI QC - Three-part QC protocol for DTI genetic analysis:
# - Part 1: Initial filtering, sex checks, phenotype coding
# - Part 2: Relatedness analysis, HapMap merging, MDS analysis for ancestry
# - Part 3: PCA covariate generation, statistics collection, final packaging
#
# Usage:
# qsub run_napls_qc.sh
#
# Required inputs:
# - NAPLS3 genotype data (.bed/.bim/.fam) in $NAPLS3_DIR
# - Phenotype and DTI data in $INFO_DIR (see subscripts)
#
# Dependencies:
# - PLINK 1.9, PLINK 2.0
# - rsid_tools (https://github.com/HTGenomeAnalysisUnit/rsid_tools)
# - R with required packages (see part3 script)
# =============================================================================
# --- Job parameters [adjust as needed]---
# NOTE: qsub scans "#$" lines anywhere in the script; $JOB_ID etc. below are
# SGE pseudo-variables, not shell variables.
#$ -cwd
#$ -l h_rt=3:00:00,h_data=2G
#$ -N NAPLS_QC_Pipeline
#$ -j y
# FIX: was "$JOB_ID_napls_qc_master.log" — SGE parses that as one variable
# named JOB_ID_napls_qc_master (undefined). A '.' terminates the name, and
# quotes in directive paths are taken literally by qsub, so use:
#$ -o $HOME/project-cbearden/napls/gprep/processed_genotype/logs/$JOB_ID.napls_qc_master.log
# --- Set up environment ---
# Base project directories [adjust as needed]
export NAPLS3_DIR="/u/project/cbearden/hughesdy/NAPLS/raw_genotype/NAPLS3"
export PLINK2="/u/project/cbearden/hughesdy/software/plink2/plink2"
export PLINK19="/u/project/cbearden/hughesdy/software/plinkv1.9/plink"
export RSID_TOOLS="$HOME/apps/rsid_tools/bin/rsid_tools"
# Job project directories [adjust as needed]
# $JOB_ID here is the SGE-provided environment variable at runtime.
export WORK_DIR="/u/home/c/cobeaman/project-cbearden/napls/gprep/processed_genotype"
export SCRATCH_DIR="/u/scratch/c/cobeaman/napls_qc_$JOB_ID"
export LOG_DIR="${WORK_DIR}/logs/${JOB_ID}_napls_qc_master"
export ENIGMA_DIR="${WORK_DIR}/enigma/DTIgenetics"
export INFO_DIR="${ENIGMA_DIR}/info"
export RS_BIN_DIR="${HOME}/scratch/GRCh37_dbSNP156_Binaries/Standard"
export STATE_FILE="${LOG_DIR}/pipeline_state.txt"
export CHECKPOINT_FILE="${LOG_DIR}/napls3_qc_checkpoint.txt"
export SUMMARY_FILE="${WORK_DIR}/napls3_qc_pipeline_summary.txt"
# --- Functions ---
# log: timestamped message to stdout and appended to the master run log.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] - $1" | tee -a "${LOG_DIR}/napls3_qc_run.log"; }
# err: log an error and report failure (return 1) so callers can react.
err() { log "ERROR: $1"; return 1; }
warn() { log "WARNING: $1"; }
success() { log "SUCCESS: $1"; }
# confirm_file: succeed only if the file exists and is non-empty.
confirm_file() { [[ -s "$1" ]] || err "Required file not found or empty: $1"; }
# set_state / get_state: persist and read the current pipeline state.
set_state() { echo "$1" > "$STATE_FILE"; }
# FIX: the old `[[ -f ]] && cat || echo init` form would print BOTH the file
# contents path and "init" whenever `cat` failed after the existence check;
# a proper if/else has exactly one output path.
get_state() {
  if [[ -f "$STATE_FILE" ]]; then
    cat "$STATE_FILE"
  else
    echo "init"
  fi
}
# step_completed: true if the named step was checkpointed by a previous run.
step_completed() { grep -q "^COMPLETED: ${1}$" "${CHECKPOINT_FILE}" 2>/dev/null; }
# mark_completed: append a checkpoint entry so the step is skipped on resume.
mark_completed() { echo "COMPLETED: ${1}" >> "${CHECKPOINT_FILE}"; log "Checkpoint saved: ${1}"; }
# Cache `find` results on scratch so repeated lookups skip filesystem scans.
# Args: dir pattern type sort_opt count
#   sort_opt "time" -> newest first by mtime; otherwise raw find order.
#   Prints up to `count` matches (default 1); results are cached per-argument.
cached_find() {
  local search_dir="$1" glob="$2" ftype="$3" order="$4" limit="$5"
  local key="${search_dir//\//_}_${glob//\//_}_${ftype}_${order}_${limit}"
  local cached="${SCRATCH_DIR}/cache_${key}"
  # Serve from cache when this exact query was answered before.
  if [[ -f "$cached" ]]; then
    cat "$cached"
    return
  fi
  mkdir -p "$(dirname "$cached")"
  if [[ "$order" == "time" ]]; then
    # Newest-first: sort on the mtime epoch column, then strip it off.
    find "$search_dir" -maxdepth 1 -name "$glob" -type "$ftype" -printf '%T@ %p\n' 2>/dev/null \
      | sort -nr | head -"${limit:-1}" | cut -d' ' -f2- | tee "$cached"
  else
    find "$search_dir" -maxdepth 1 -name "$glob" -type "$ftype" 2>/dev/null \
      | head -"${limit:-1}" | tee "$cached"
  fi
}
# Run one pipeline step with checkpoint skipping, state tracking, and a timeout.
# Args: script description [timeout_secs=7200] state_name
# Returns 0 on success (or already-completed skip); 1 on failure or timeout.
run_step() {
  local script="$1" description="$2" timeout="${3:-7200}" state_name="$4"
  local rc

  # Skip steps that a previous (possibly interrupted) run already finished.
  if step_completed "$state_name"; then
    log "SKIPPING: $description (already completed)"
    return 0
  fi

  # Mark step as in progress, then run with timeout protection.
  set_state "${state_name}_in_progress"
  log "STARTING: $description"

  # Tee the step's output into the master log, but judge success by the
  # step's own exit status (PIPESTATUS[0]), never tee's. Capture it
  # IMMEDIATELY: any subsequent command (even [[ ]]) overwrites PIPESTATUS.
  # (The old code read PIPESTATUS a second time after a [[ ]] test — reporting
  # a stale code — and its timeout-124 branch was only reachable if tee itself
  # failed, so timeouts were never actually detected.)
  timeout "$timeout" bash "$script" 2>&1 | tee -a "${LOG_DIR}/napls3_qc_run.log"
  rc=${PIPESTATUS[0]}

  if [[ $rc -eq 0 ]]; then
    log "COMPLETED: $description"
    set_state "${state_name}_completed"
    mark_completed "$state_name"
    return 0
  elif [[ $rc -eq 124 ]]; then
    # timeout(1) exits with 124 when the command was killed for overrunning.
    err "$description timed out after $timeout seconds"
    return 1
  else
    err "$description failed with exit code $rc. See log for details."
    return 1
  fi
}
# Confirm required tools and genotype inputs exist before starting.
# Returns 0 when everything critical is present, 1 otherwise. Missing
# phenotype files only warn here — they are needed by later ENIGMA steps.
verify_prerequisites() {
  log "Verifying prerequisites..."
  local status=0
  local tool ext data_file

  # Required executables (absolute paths or names resolvable via PATH).
  for tool in "$PLINK19" "$PLINK2" "$RSID_TOOLS" timeout bc; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      err "Command not found: $tool"
      status=1
    fi
  done

  # The PLINK trio (.bed/.bim/.fam) must all exist and be non-empty.
  for ext in bed bim fam; do
    confirm_file "${NAPLS3_DIR}/NAPLS3_n710.${ext}" || status=1
  done

  # Phenotype/DTI inputs: warn only; the ENIGMA QC steps will need them.
  mkdir -p "$INFO_DIR"
  for data_file in "${INFO_DIR}/NAPLS3_Terra_samplestab_phenofile.txt" "${INFO_DIR}/napls3_MS_diffusion.csv"; do
    if [[ ! -f "$data_file" ]]; then
      warn "Required file for ENIGMA QC not found: $data_file"
      warn "Please ensure this file exists before running the ENIGMA QC steps."
    fi
  done
  return $status
}
# Validate a directory referenced by the NAMED environment variable.
# Args: dir_var (variable name, dereferenced via ${!}) dir_name (label) required
#   "required" -> error and return 1 if missing
#   "create"   -> warn, then create it (return 1 only if mkdir fails)
#   other      -> warn only
verify_directory() {
  local var_name="$1" label="$2" mode="$3"
  local path="${!var_name}"

  # Nothing to do when the directory already exists.
  [[ -d "$path" ]] && return 0

  if [[ "$mode" == "required" ]]; then
    err "$label directory not found at: $path"
    return 1
  fi

  warn "$label directory not found at: $path"
  if [[ "$mode" == "create" ]]; then
    log "Creating $label directory: $path"
    if ! mkdir -p "$path"; then
      err "Failed to create $label directory"
      return 1
    fi
    success "Created $label directory: $path"
  fi
  return 0
}
# Create the working directory tree and the resume checkpoint file.
# Returns 1 as soon as any directory cannot be validated/created.
setup_directories() {
  log "Setting up directory structure..."
  local spec var_name label mode
  # Each spec is "VAR_NAME:human label:mode" consumed by verify_directory.
  for spec in \
    "WORK_DIR:work:create" \
    "SCRATCH_DIR:scratch:create" \
    "LOG_DIR:log:create" \
    "ENIGMA_DIR:ENIGMA:create" \
    "INFO_DIR:phenotype info:create"; do
    IFS=':' read -r var_name label mode <<< "$spec"
    verify_directory "$var_name" "$label" "$mode" || return 1
  done
  # The checkpoint file records finished steps for resume support.
  if ! touch "$CHECKPOINT_FILE"; then
    err "Failed to create checkpoint file"
    return 1
  fi
  return 0
}
# Signal handler: deliberately delete nothing, so a rerun can resume from
# the last checkpoint; just record the interruption and exit non-zero.
cleanup() {
  log "Pipeline interrupted or terminated. Saving state for resume capability."
  exit 1
}
# --- Main workflow ---
# main: orchestrates the five pipeline stages with checkpoint/resume support,
# verifies each stage's output files before continuing, writes a final summary
# to $SUMMARY_FILE, and optionally cleans scratch (AUTO_CLEANUP=yes).
# Relies on the exported *_DIR/*_FILE variables and the helper functions above.
main() {
# Set up traps for signal handling
# NOTE(review): only HUP/INT/TERM are trapped — there is no EXIT/ERR trap,
# so ordinary command failures do not invoke cleanup.
trap cleanup SIGHUP SIGINT SIGTERM
log "Starting NAPLS3 Genomic Data QC Pipeline"
# Create directory structure
setup_directories || { err "Failed to setup directories"; exit 1; }
# Verify prerequisites
verify_prerequisites || {
err "Prerequisite check failed. Fix errors before proceeding."
log "TIP: Ensure all required software and input files are available and accessible."
exit 1
}
# --- STAGE 1: Binary creation (one-time setup) ---
# Short-circuit: if the dbSNP binaries already exist on scratch, record the
# step as done without running the (8-hour-budget) creation script.
if ! step_completed "rsid_binaries"; then
log "Step 1 of 5: Creating RSID binaries (one-time setup)"
if [[ -d "$RS_BIN_DIR" && -f "${RS_BIN_DIR}/GRCh37_1.hash2rsid.bin" ]]; then
log "RSID binaries already exist at ${RS_BIN_DIR} (skipping creation)"
mark_completed "rsid_binaries"
else
run_step "${WORK_DIR}/01_create_rsid_binaries.sh" "Creation of RSID binary files" 28800 "rsid_binaries" || {
err "Failed to create RSID binaries. Please check logs at ${LOG_DIR}"
exit 1
}
fi
else
log "Step 1 of 5: RSID binaries creation already completed (skipping)"
fi
# --- STAGE 2: SNP Renaming ---
if ! step_completed "rename_snps"; then
log "Step 2 of 5: Renaming SNPs to standard rsIDs"
run_step "${WORK_DIR}/01_rename_snps_direct.sh" "SNP renaming process" 7200 "rename_snps" || {
err "SNP renaming failed. Please check logs at ${LOG_DIR}"
exit 1
}
else
log "Step 2 of 5: SNP renaming already completed (skipping)"
fi
# Find the most recent renamed genotype files more efficiently
# Picks the newest NAPLS3_n710_renamed_*.bed by mtime, then strips the
# extension to get the PLINK fileset prefix.
RENAMED_PREFIX=$(cached_find "${WORK_DIR}" "NAPLS3_n710_renamed_*[0-9]*.bed" "f" "time" 1)
RENAMED_PREFIX="${RENAMED_PREFIX%.bed}"
if [[ -z "$RENAMED_PREFIX" ]]; then
err "No renamed genotype files found after SNP renaming step. Pipeline cannot continue."
log "TIP: Check if the SNP renaming step completed successfully and produced output files."
exit 1
fi
log "Using renamed genotype files: ${RENAMED_PREFIX}"
# --- STAGE 3: ENIGMA-DTI QC Part 1 ---
if ! step_completed "enigma_qc_part1"; then
log "Step 3 of 5: ENIGMA-DTI QC Part 1 (Initial filtering, sex checks, phenotype coding)"
run_step "${WORK_DIR}/02_enigma_dti_qc_napls3_part1.sh" "ENIGMA-DTI QC Part 1" 3600 "enigma_qc_part1" || {
err "ENIGMA-DTI QC Part 1 failed. Please check logs and fix issues before continuing."
exit 1
}
else
log "Step 3 of 5: ENIGMA-DTI QC Part 1 already completed (skipping)"
fi
# Find Part 1 output directory with optimized search
# (Newest directory matching the part-1 naming convention; same pattern is
# repeated for parts 2 and 3 below.)
PART1_DIR=$(cached_find "${ENIGMA_DIR}" "*_enigma_dti_qc_napls3_part1" "d" "time" 1)
if [[ -z "$PART1_DIR" ]]; then
err "Part 1 output directory not found. Cannot proceed to next step."
log "TIP: Check if Part 1 completed successfully and created its output directory."
exit 1
fi
log "Using Part 1 results from: $PART1_DIR"
# Verify Part 1 outputs with a more efficient check
PART1_QC1_FILE=$(cached_find "${PART1_DIR}" "*_QC1.bed" "f" "" 1)
if [[ -z "$PART1_QC1_FILE" ]]; then
err "Critical Part 1 output files (*_QC1.bed) not found in ${PART1_DIR}"
log "TIP: Check Part 1 logs for errors that may have prevented output generation."
exit 1
fi
log "Found QC1 dataset: ${PART1_QC1_FILE%.bed}.[bed,bim,fam]"
# --- STAGE 4: ENIGMA-DTI QC Part 2 ---
if ! step_completed "enigma_qc_part2"; then
log "Step 4 of 5: ENIGMA-DTI QC Part 2 (Relatedness checks, HapMap merging, MDS analysis)"
run_step "${WORK_DIR}/02_enigma_dti_qc_napls3_part2.sh" "ENIGMA-DTI QC Part 2" 3600 "enigma_qc_part2" || {
err "ENIGMA-DTI QC Part 2 failed. Please check logs and fix issues before continuing."
exit 1
}
else
log "Step 4 of 5: ENIGMA-DTI QC Part 2 already completed (skipping)"
fi
# Find Part 2 output directory
PART2_DIR=$(cached_find "${ENIGMA_DIR}" "*_enigma_dti_qc_napls3_part2" "d" "time" 1)
if [[ -z "$PART2_DIR" ]]; then
err "Part 2 output directory not found. Cannot proceed to next step."
log "TIP: Check if Part 2 completed successfully and created its output directory."
exit 1
fi
log "Using Part 2 results from: $PART2_DIR"
# Verify Part 2 outputs
PART2_QC3_FILE=$(cached_find "${PART2_DIR}" "*_QC3.bed" "f" "" 1)
if [[ -z "$PART2_QC3_FILE" ]]; then
err "Critical Part 2 output files (*_QC3.bed) not found in ${PART2_DIR}"
log "TIP: Check Part 2 logs for errors that may have prevented output generation."
exit 1
fi
log "Found QC3 dataset: ${PART2_QC3_FILE%.bed}.[bed,bim,fam]"
# --- STAGE 5: ENIGMA-DTI QC Part 3 ---
if ! step_completed "enigma_qc_part3"; then
log "Step 5 of 5: ENIGMA-DTI QC Part 3 (PCA covariates, summary statistics, packaging)"
run_step "${WORK_DIR}/02_enigma_dti_qc_napls3_part3.sh" "ENIGMA-DTI QC Part 3" 3600 "enigma_qc_part3" || {
err "ENIGMA-DTI QC Part 3 failed. Please check logs for details."
log "TIP: Check Part 3 logs for specific error messages."
exit 1
}
else
log "Step 5 of 5: ENIGMA-DTI QC Part 3 already completed (skipping)"
fi
# Find Part 3 output directory
PART3_DIR=$(cached_find "${ENIGMA_DIR}" "*_enigma_dti_qc_napls3_part3" "d" "time" 1)
if [[ -z "$PART3_DIR" ]]; then
err "Part 3 output directory not found. Final results may be missing."
log "TIP: Check if Part 3 completed successfully and created its output directory."
exit 1
fi
log "Using Part 3 results from: $PART3_DIR"
# --- Generate comprehensive pipeline summary ---
# The whole brace group below is written to $SUMMARY_FILE and appended to
# the master run log via tee.
OUTPUT_ZIP=$(cached_find "${PART3_DIR}" "*_ENIGMA-DTI_FilesToSend.zip" "f" "" 1)
QC3_FILES=$(cached_find "${PART3_DIR}" "*_QC3.bed" "f" "" 1)
{
echo "==================================================================="
echo "NAPLS3 QC PIPELINE SUMMARY ($(date))"
echo "==================================================================="
# Calculate runtime more efficiently
# NOTE(review): this uses the run log's LAST-MODIFIED time as the start
# time, which is only accurate for a fresh (non-resumed) run — on resume
# the log mtime is recent and the reported runtime shrinks accordingly.
START_TIME=$(stat -c %Y "${LOG_DIR}/napls3_qc_run.log" 2>/dev/null || echo $(date +%s))
END_TIME=$(date +%s)
TOTAL_MINS=$(( (END_TIME - START_TIME) / 60 ))
HOURS=$(( TOTAL_MINS / 60 ))
MINS=$(( TOTAL_MINS % 60 ))
echo "Total runtime: ${HOURS}h ${MINS}m (${TOTAL_MINS} minutes)"
echo ""
# Document pipeline versions and parameters
echo "Pipeline Configuration:"
echo "- Build: GRCh37"
echo "- dbSNP version: 156"
echo "- Ancestry filtering: European (EUR)"
# One .fam line per individual, so wc -l is the input subject count.
echo "- Input subjects: $(wc -l < "${NAPLS3_DIR}/NAPLS3_n710.fam") individuals"
# Get final subject count more robustly
STATS_FILE=$(find "${PART3_DIR}/output_all/" -name "*_basic_stats_postQC.txt" -type f 2>/dev/null | head -1)
if [[ -f "$STATS_FILE" ]]; then
# NOTE(review): assumes columns 2 and 3 of the stats file's last line sum
# to the final subject count (e.g. males+females or cases+controls) —
# TODO confirm against the part-3 script's output format.
FINAL_COUNT=$(tail -1 "$STATS_FILE" | awk '{print $2+$3}')
echo "- Final subjects: $FINAL_COUNT individuals"
else
echo "- Final subjects: Unknown (stats file not found)"
fi
echo ""
echo "Output Locations:"
echo "- SNP Renamed Files: ${RENAMED_PREFIX}.[bed,bim,fam]"
echo "- QC Part 1 Results: ${PART1_DIR}"
echo "- QC Part 2 Results: ${PART2_DIR}"
echo "- QC Part 3 Results: ${PART3_DIR}"
[[ -n "$OUTPUT_ZIP" ]] && echo "- Final ZIP Package: $OUTPUT_ZIP"
[[ -n "$QC3_FILES" ]] && echo "- Final QC3 Files: $(dirname "$QC3_FILES")/$(basename "$QC3_FILES" .bed).[bed,bim,fam]"
echo ""
echo "Final QC Summary:"
QC3_SUMMARY=$(find "${PART3_DIR}/output_all/" -name "*_QC3_summary.txt" -type f 2>/dev/null | head -1)
if [[ -f "$QC3_SUMMARY" ]]; then
cat "$QC3_SUMMARY"
else
echo "QC3 summary file not found."
fi
echo ""
echo "Pipeline Completion Status:"
for step in "rsid_binaries" "rename_snps" "enigma_qc_part1" "enigma_qc_part2" "enigma_qc_part3"; do
if step_completed "$step"; then
echo "- ${step}: COMPLETED"
else
echo "- ${step}: NOT COMPLETED"
fi
done
echo "==================================================================="
} | tee "${SUMMARY_FILE}" | tee -a "${LOG_DIR}/napls3_qc_run.log"
# --- Clean up scratch directory if requested ---
# Opt-in via AUTO_CLEANUP=yes in the environment; default preserves scratch
# (including the cached_find cache) for debugging and resume.
if [[ -d "$SCRATCH_DIR" && "${AUTO_CLEANUP:-no}" == "yes" ]]; then
log "Cleaning up scratch directory: $SCRATCH_DIR"
rm -rf "$SCRATCH_DIR" && success "Scratch directory removed"
else
log "Scratch directory preserved at: $SCRATCH_DIR"
log "To clean up manually, run: rm -rf $SCRATCH_DIR"
fi
success "NAPLS3 Genomic Data QC Pipeline Completed Successfully"
return 0
}
# Execute the main workflow; the script's exit status is main's return value
# (the previous explicit `exit $?` was redundant — the last command's status
# propagates automatically).
main "$@"