Open in github.dev
Open in a new github.dev tab
Open in codespace
3 months agoMar 19, 2025
Expand file tree
/
Copy pathrun_napls_qc.sh
t
Latest commit
Open commit details
executable file
·410 lines (361 loc) · 16.4 KB
/
run_napls_qc.sh
File metadata and controls
executable file
·410 lines (361 loc) · 16.4 KB
Ask Copilot about this file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
#!/bin/bash
# =============================================================================
# NAPLS3 Genomic Data QC Pipeline - Master Orchestration Script
# Individual scripts in /gprep/processed_genotype can also be run sequentially
# =============================================================================
# Purpose: Coordinates the complete QC workflow for NAPLS3 genomic data through
# the ENIGMA-DTI pre-imputation QC pipeline (with SNP renaming).
#
# Workflow stages:
# 1. Create rsid binaries - One-time setup of dbSNP156 binary files for variant mapping
# 2. Rename SNPs - Convert variants to standard rsID naming
# 3. ENIGMA-DTI QC - Three-part QC protocol for DTI genetic analysis:
# - Part 1: Initial filtering, sex checks, phenotype coding
# - Part 2: Relatedness analysis, HapMap merging, MDS analysis for ancestry
# - Part 3: PCA covariate generation, statistics collection, final packaging
#
# Usage:
# qsub run_napls_qc.sh
#
# Required inputs:
# - NAPLS3 genotype data (.bed/.bim/.fam) in $NAPLS3_DIR
# - Phenotype and DTI data in $INFO_DIR (see subscripts)
#
# Dependencies:
# - PLINK 1.9, PLINK 2.0
# - rsid_tools (https://github.com/HTGenomeAnalysisUnit/rsid_tools)
# - R with required packages (see part3 script)
# =============================================================================
# --- Job parameters [adjust as needed]---
# NOTE: qsub scans "#$" lines anywhere in the script; $JOB_ID etc. below are
# SGE pseudo-variables, not shell variables.
#$ -cwd
#$ -l h_rt=3:00:00,h_data=2G
#$ -N NAPLS_QC_Pipeline
#$ -j y
# FIX: was "$JOB_ID_napls_qc_master.log" — SGE parses that as one variable
# named JOB_ID_napls_qc_master (undefined). A '.' terminates the name, and
# quotes in directive paths are taken literally by qsub, so use:
#$ -o $HOME/project-cbearden/napls/gprep/processed_genotype/logs/$JOB_ID.napls_qc_master.log
# --- Set up environment ---
# Base project directories [adjust as needed]
export NAPLS3_DIR="/u/project/cbearden/hughesdy/NAPLS/raw_genotype/NAPLS3"
export PLINK2="/u/project/cbearden/hughesdy/software/plink2/plink2"
export PLINK19="/u/project/cbearden/hughesdy/software/plinkv1.9/plink"
export RSID_TOOLS="$HOME/apps/rsid_tools/bin/rsid_tools"
# Job project directories [adjust as needed]
# $JOB_ID here is the SGE-provided environment variable at runtime.
export WORK_DIR="/u/home/c/cobeaman/project-cbearden/napls/gprep/processed_genotype"
export SCRATCH_DIR="/u/scratch/c/cobeaman/napls_qc_$JOB_ID"
export LOG_DIR="${WORK_DIR}/logs/${JOB_ID}_napls_qc_master"
export ENIGMA_DIR="${WORK_DIR}/enigma/DTIgenetics"
export INFO_DIR="${ENIGMA_DIR}/info"
export RS_BIN_DIR="${HOME}/scratch/GRCh37_dbSNP156_Binaries/Standard"
export STATE_FILE="${LOG_DIR}/pipeline_state.txt"
export CHECKPOINT_FILE="${LOG_DIR}/napls3_qc_checkpoint.txt"
export SUMMARY_FILE="${WORK_DIR}/napls3_qc_pipeline_summary.txt"
# --- Functions ---
# log: timestamped message to stdout and appended to the master run log.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] - $1" | tee -a "${LOG_DIR}/napls3_qc_run.log"; }
# err: log an error and report failure (return 1) so callers can react.
err() { log "ERROR: $1"; return 1; }
warn() { log "WARNING: $1"; }
success() { log "SUCCESS: $1"; }
# confirm_file: succeed only if the file exists and is non-empty.
confirm_file() { [[ -s "$1" ]] || err "Required file not found or empty: $1"; }
# set_state / get_state: persist and read the current pipeline state.
set_state() { echo "$1" > "$STATE_FILE"; }
# FIX: the old `[[ -f ]] && cat || echo init` form would print BOTH the file
# contents path and "init" whenever `cat` failed after the existence check;
# a proper if/else has exactly one output path.
get_state() {
  if [[ -f "$STATE_FILE" ]]; then
    cat "$STATE_FILE"
  else
    echo "init"
  fi
}
# step_completed: true if the named step was checkpointed by a previous run.
step_completed() { grep -q "^COMPLETED: ${1}$" "${CHECKPOINT_FILE}" 2>/dev/null; }
# mark_completed: append a checkpoint entry so the step is skipped on resume.
mark_completed() { echo "COMPLETED: ${1}" >> "${CHECKPOINT_FILE}"; log "Checkpoint saved: ${1}"; }
# Cache `find` results on scratch so repeated lookups skip filesystem scans.
# Args: dir pattern type sort_opt count
#   sort_opt "time" -> newest first by mtime; otherwise raw find order.
#   Prints up to `count` matches (default 1); results are cached per-argument.
cached_find() {
  local search_dir="$1" glob="$2" ftype="$3" order="$4" limit="$5"
  local key="${search_dir//\//_}_${glob//\//_}_${ftype}_${order}_${limit}"
  local cached="${SCRATCH_DIR}/cache_${key}"
  # Serve from cache when this exact query was answered before.
  if [[ -f "$cached" ]]; then
    cat "$cached"
    return
  fi
  mkdir -p "$(dirname "$cached")"
  if [[ "$order" == "time" ]]; then
    # Newest-first: sort on the mtime epoch column, then strip it off.
    find "$search_dir" -maxdepth 1 -name "$glob" -type "$ftype" -printf '%T@ %p\n' 2>/dev/null \
      | sort -nr | head -"${limit:-1}" | cut -d' ' -f2- | tee "$cached"
  else
    find "$search_dir" -maxdepth 1 -name "$glob" -type "$ftype" 2>/dev/null \
      | head -"${limit:-1}" | tee "$cached"
  fi
}
# Run one pipeline step with checkpoint skipping, state tracking, and a timeout.
# Args: script description [timeout_secs=7200] state_name
# Returns 0 on success (or already-completed skip); 1 on failure or timeout.
run_step() {
  local script="$1" description="$2" timeout="${3:-7200}" state_name="$4"
  local rc

  # Skip steps that a previous (possibly interrupted) run already finished.
  if step_completed "$state_name"; then
    log "SKIPPING: $description (already completed)"
    return 0
  fi

  # Mark step as in progress, then run with timeout protection.
  set_state "${state_name}_in_progress"
  log "STARTING: $description"

  # Tee the step's output into the master log, but judge success by the
  # step's own exit status (PIPESTATUS[0]), never tee's. Capture it
  # IMMEDIATELY: any subsequent command (even [[ ]]) overwrites PIPESTATUS.
  # (The old code read PIPESTATUS a second time after a [[ ]] test — reporting
  # a stale code — and its timeout-124 branch was only reachable if tee itself
  # failed, so timeouts were never actually detected.)
  timeout "$timeout" bash "$script" 2>&1 | tee -a "${LOG_DIR}/napls3_qc_run.log"
  rc=${PIPESTATUS[0]}

  if [[ $rc -eq 0 ]]; then
    log "COMPLETED: $description"
    set_state "${state_name}_completed"
    mark_completed "$state_name"
    return 0
  elif [[ $rc -eq 124 ]]; then
    # timeout(1) exits with 124 when the command was killed for overrunning.
    err "$description timed out after $timeout seconds"
    return 1
  else
    err "$description failed with exit code $rc. See log for details."
    return 1
  fi
}
# Confirm required tools and genotype inputs exist before starting.
# Returns 0 when everything critical is present, 1 otherwise. Missing
# phenotype files only warn here — they are needed by later ENIGMA steps.
verify_prerequisites() {
  log "Verifying prerequisites..."
  local status=0
  local tool ext data_file

  # Required executables (absolute paths or names resolvable via PATH).
  for tool in "$PLINK19" "$PLINK2" "$RSID_TOOLS" timeout bc; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      err "Command not found: $tool"
      status=1
    fi
  done

  # The PLINK trio (.bed/.bim/.fam) must all exist and be non-empty.
  for ext in bed bim fam; do
    confirm_file "${NAPLS3_DIR}/NAPLS3_n710.${ext}" || status=1
  done

  # Phenotype/DTI inputs: warn only; the ENIGMA QC steps will need them.
  mkdir -p "$INFO_DIR"
  for data_file in "${INFO_DIR}/NAPLS3_Terra_samplestab_phenofile.txt" "${INFO_DIR}/napls3_MS_diffusion.csv"; do
    if [[ ! -f "$data_file" ]]; then
      warn "Required file for ENIGMA QC not found: $data_file"
      warn "Please ensure this file exists before running the ENIGMA QC steps."
    fi
  done
  return $status
}
# Validate a directory referenced by the NAMED environment variable.
# Args: dir_var (variable name, dereferenced via ${!}) dir_name (label) required
#   "required" -> error and return 1 if missing
#   "create"   -> warn, then create it (return 1 only if mkdir fails)
#   other      -> warn only
verify_directory() {
  local var_name="$1" label="$2" mode="$3"
  local path="${!var_name}"

  # Nothing to do when the directory already exists.
  [[ -d "$path" ]] && return 0

  if [[ "$mode" == "required" ]]; then
    err "$label directory not found at: $path"
    return 1
  fi

  warn "$label directory not found at: $path"
  if [[ "$mode" == "create" ]]; then
    log "Creating $label directory: $path"
    if ! mkdir -p "$path"; then
      err "Failed to create $label directory"
      return 1
    fi
    success "Created $label directory: $path"
  fi
  return 0
}
# Create the working directory tree and the resume checkpoint file.
# Returns 1 as soon as any directory cannot be validated/created.
setup_directories() {
  log "Setting up directory structure..."
  local spec var_name label mode
  # Each spec is "VAR_NAME:human label:mode" consumed by verify_directory.
  for spec in \
    "WORK_DIR:work:create" \
    "SCRATCH_DIR:scratch:create" \
    "LOG_DIR:log:create" \
    "ENIGMA_DIR:ENIGMA:create" \
    "INFO_DIR:phenotype info:create"; do
    IFS=':' read -r var_name label mode <<< "$spec"
    verify_directory "$var_name" "$label" "$mode" || return 1
  done
  # The checkpoint file records finished steps for resume support.
  if ! touch "$CHECKPOINT_FILE"; then
    err "Failed to create checkpoint file"
    return 1
  fi
  return 0
}
# Signal handler: deliberately delete nothing, so a rerun can resume from
# the last checkpoint; just record the interruption and exit non-zero.
cleanup() {
  log "Pipeline interrupted or terminated. Saving state for resume capability."
  exit 1
}
# --- Main workflow ---
# main: orchestrates the five pipeline stages with checkpoint/resume support,
# verifies each stage's output files before continuing, writes a final summary
# to $SUMMARY_FILE, and optionally cleans scratch (AUTO_CLEANUP=yes).
# Relies on the exported *_DIR/*_FILE variables and the helper functions above.
main() {
# Set up traps for signal handling
# NOTE(review): only HUP/INT/TERM are trapped — there is no EXIT/ERR trap,
# so ordinary command failures do not invoke cleanup.
trap cleanup SIGHUP SIGINT SIGTERM
log "Starting NAPLS3 Genomic Data QC Pipeline"
# Create directory structure
setup_directories || { err "Failed to setup directories"; exit 1; }
# Verify prerequisites
verify_prerequisites || {
err "Prerequisite check failed. Fix errors before proceeding."
log "TIP: Ensure all required software and input files are available and accessible."
exit 1
}
# --- STAGE 1: Binary creation (one-time setup) ---
# Short-circuit: if the dbSNP binaries already exist on scratch, record the
# step as done without running the (8-hour-budget) creation script.
if ! step_completed "rsid_binaries"; then
log "Step 1 of 5: Creating RSID binaries (one-time setup)"
if [[ -d "$RS_BIN_DIR" && -f "${RS_BIN_DIR}/GRCh37_1.hash2rsid.bin" ]]; then
log "RSID binaries already exist at ${RS_BIN_DIR} (skipping creation)"
mark_completed "rsid_binaries"
else
run_step "${WORK_DIR}/01_create_rsid_binaries.sh" "Creation of RSID binary files" 28800 "rsid_binaries" || {
err "Failed to create RSID binaries. Please check logs at ${LOG_DIR}"
exit 1
}
fi
else
log "Step 1 of 5: RSID binaries creation already completed (skipping)"
fi
# --- STAGE 2: SNP Renaming ---
if ! step_completed "rename_snps"; then
log "Step 2 of 5: Renaming SNPs to standard rsIDs"
run_step "${WORK_DIR}/01_rename_snps_direct.sh" "SNP renaming process" 7200 "rename_snps" || {
err "SNP renaming failed. Please check logs at ${LOG_DIR}"
exit 1
}
else
log "Step 2 of 5: SNP renaming already completed (skipping)"
fi
# Find the most recent renamed genotype files more efficiently
# Picks the newest NAPLS3_n710_renamed_*.bed by mtime, then strips the
# extension to get the PLINK fileset prefix.
RENAMED_PREFIX=$(cached_find "${WORK_DIR}" "NAPLS3_n710_renamed_*[0-9]*.bed" "f" "time" 1)
RENAMED_PREFIX="${RENAMED_PREFIX%.bed}"
if [[ -z "$RENAMED_PREFIX" ]]; then
err "No renamed genotype files found after SNP renaming step. Pipeline cannot continue."
log "TIP: Check if the SNP renaming step completed successfully and produced output files."
exit 1
fi
log "Using renamed genotype files: ${RENAMED_PREFIX}"
# --- STAGE 3: ENIGMA-DTI QC Part 1 ---
if ! step_completed "enigma_qc_part1"; then
log "Step 3 of 5: ENIGMA-DTI QC Part 1 (Initial filtering, sex checks, phenotype coding)"
run_step "${WORK_DIR}/02_enigma_dti_qc_napls3_part1.sh" "ENIGMA-DTI QC Part 1" 3600 "enigma_qc_part1" || {
err "ENIGMA-DTI QC Part 1 failed. Please check logs and fix issues before continuing."
exit 1
}
else
log "Step 3 of 5: ENIGMA-DTI QC Part 1 already completed (skipping)"
fi
# Find Part 1 output directory with optimized search
# (Newest directory matching the part-1 naming convention; same pattern is
# repeated for parts 2 and 3 below.)
PART1_DIR=$(cached_find "${ENIGMA_DIR}" "*_enigma_dti_qc_napls3_part1" "d" "time" 1)
if [[ -z "$PART1_DIR" ]]; then
err "Part 1 output directory not found. Cannot proceed to next step."
log "TIP: Check if Part 1 completed successfully and created its output directory."
exit 1
fi
log "Using Part 1 results from: $PART1_DIR"
# Verify Part 1 outputs with a more efficient check
PART1_QC1_FILE=$(cached_find "${PART1_DIR}" "*_QC1.bed" "f" "" 1)
if [[ -z "$PART1_QC1_FILE" ]]; then
err "Critical Part 1 output files (*_QC1.bed) not found in ${PART1_DIR}"
log "TIP: Check Part 1 logs for errors that may have prevented output generation."
exit 1
fi
log "Found QC1 dataset: ${PART1_QC1_FILE%.bed}.[bed,bim,fam]"
# --- STAGE 4: ENIGMA-DTI QC Part 2 ---
if ! step_completed "enigma_qc_part2"; then
log "Step 4 of 5: ENIGMA-DTI QC Part 2 (Relatedness checks, HapMap merging, MDS analysis)"
run_step "${WORK_DIR}/02_enigma_dti_qc_napls3_part2.sh" "ENIGMA-DTI QC Part 2" 3600 "enigma_qc_part2" || {
err "ENIGMA-DTI QC Part 2 failed. Please check logs and fix issues before continuing."
exit 1
}
else
log "Step 4 of 5: ENIGMA-DTI QC Part 2 already completed (skipping)"
fi
# Find Part 2 output directory
PART2_DIR=$(cached_find "${ENIGMA_DIR}" "*_enigma_dti_qc_napls3_part2" "d" "time" 1)
if [[ -z "$PART2_DIR" ]]; then
err "Part 2 output directory not found. Cannot proceed to next step."
log "TIP: Check if Part 2 completed successfully and created its output directory."
exit 1
fi
log "Using Part 2 results from: $PART2_DIR"
# Verify Part 2 outputs
PART2_QC3_FILE=$(cached_find "${PART2_DIR}" "*_QC3.bed" "f" "" 1)
if [[ -z "$PART2_QC3_FILE" ]]; then
err "Critical Part 2 output files (*_QC3.bed) not found in ${PART2_DIR}"
log "TIP: Check Part 2 logs for errors that may have prevented output generation."
exit 1
fi
log "Found QC3 dataset: ${PART2_QC3_FILE%.bed}.[bed,bim,fam]"
# --- STAGE 5: ENIGMA-DTI QC Part 3 ---
if ! step_completed "enigma_qc_part3"; then
log "Step 5 of 5: ENIGMA-DTI QC Part 3 (PCA covariates, summary statistics, packaging)"
run_step "${WORK_DIR}/02_enigma_dti_qc_napls3_part3.sh" "ENIGMA-DTI QC Part 3" 3600 "enigma_qc_part3" || {
err "ENIGMA-DTI QC Part 3 failed. Please check logs for details."
log "TIP: Check Part 3 logs for specific error messages."
exit 1
}
else
log "Step 5 of 5: ENIGMA-DTI QC Part 3 already completed (skipping)"
fi
# Find Part 3 output directory
PART3_DIR=$(cached_find "${ENIGMA_DIR}" "*_enigma_dti_qc_napls3_part3" "d" "time" 1)
if [[ -z "$PART3_DIR" ]]; then
err "Part 3 output directory not found. Final results may be missing."
log "TIP: Check if Part 3 completed successfully and created its output directory."
exit 1
fi
log "Using Part 3 results from: $PART3_DIR"
# --- Generate comprehensive pipeline summary ---
# The whole brace group below is written to $SUMMARY_FILE and appended to
# the master run log via tee.
OUTPUT_ZIP=$(cached_find "${PART3_DIR}" "*_ENIGMA-DTI_FilesToSend.zip" "f" "" 1)
QC3_FILES=$(cached_find "${PART3_DIR}" "*_QC3.bed" "f" "" 1)
{
echo "==================================================================="
echo "NAPLS3 QC PIPELINE SUMMARY ($(date))"
echo "==================================================================="
# Calculate runtime more efficiently
# NOTE(review): this uses the run log's LAST-MODIFIED time as the start
# time, which is only accurate for a fresh (non-resumed) run — on resume
# the log mtime is recent and the reported runtime shrinks accordingly.
START_TIME=$(stat -c %Y "${LOG_DIR}/napls3_qc_run.log" 2>/dev/null || echo $(date +%s))
END_TIME=$(date +%s)
TOTAL_MINS=$(( (END_TIME - START_TIME) / 60 ))
HOURS=$(( TOTAL_MINS / 60 ))
MINS=$(( TOTAL_MINS % 60 ))
echo "Total runtime: ${HOURS}h ${MINS}m (${TOTAL_MINS} minutes)"
echo ""
# Document pipeline versions and parameters
echo "Pipeline Configuration:"
echo "- Build: GRCh37"
echo "- dbSNP version: 156"
echo "- Ancestry filtering: European (EUR)"
# One .fam line per individual, so wc -l is the input subject count.
echo "- Input subjects: $(wc -l < "${NAPLS3_DIR}/NAPLS3_n710.fam") individuals"
# Get final subject count more robustly
STATS_FILE=$(find "${PART3_DIR}/output_all/" -name "*_basic_stats_postQC.txt" -type f 2>/dev/null | head -1)
if [[ -f "$STATS_FILE" ]]; then
# NOTE(review): assumes columns 2 and 3 of the stats file's last line sum
# to the final subject count (e.g. males+females or cases+controls) —
# TODO confirm against the part-3 script's output format.
FINAL_COUNT=$(tail -1 "$STATS_FILE" | awk '{print $2+$3}')
echo "- Final subjects: $FINAL_COUNT individuals"
else
echo "- Final subjects: Unknown (stats file not found)"
fi
echo ""
echo "Output Locations:"
echo "- SNP Renamed Files: ${RENAMED_PREFIX}.[bed,bim,fam]"
echo "- QC Part 1 Results: ${PART1_DIR}"
echo "- QC Part 2 Results: ${PART2_DIR}"
echo "- QC Part 3 Results: ${PART3_DIR}"
[[ -n "$OUTPUT_ZIP" ]] && echo "- Final ZIP Package: $OUTPUT_ZIP"
[[ -n "$QC3_FILES" ]] && echo "- Final QC3 Files: $(dirname "$QC3_FILES")/$(basename "$QC3_FILES" .bed).[bed,bim,fam]"
echo ""
echo "Final QC Summary:"
QC3_SUMMARY=$(find "${PART3_DIR}/output_all/" -name "*_QC3_summary.txt" -type f 2>/dev/null | head -1)
if [[ -f "$QC3_SUMMARY" ]]; then
cat "$QC3_SUMMARY"
else
echo "QC3 summary file not found."
fi
echo ""
echo "Pipeline Completion Status:"
for step in "rsid_binaries" "rename_snps" "enigma_qc_part1" "enigma_qc_part2" "enigma_qc_part3"; do
if step_completed "$step"; then
echo "- ${step}: COMPLETED"
else
echo "- ${step}: NOT COMPLETED"
fi
done
echo "==================================================================="
} | tee "${SUMMARY_FILE}" | tee -a "${LOG_DIR}/napls3_qc_run.log"
# --- Clean up scratch directory if requested ---
# Opt-in via AUTO_CLEANUP=yes in the environment; default preserves scratch
# (including the cached_find cache) for debugging and resume.
if [[ -d "$SCRATCH_DIR" && "${AUTO_CLEANUP:-no}" == "yes" ]]; then
log "Cleaning up scratch directory: $SCRATCH_DIR"
rm -rf "$SCRATCH_DIR" && success "Scratch directory removed"
else
log "Scratch directory preserved at: $SCRATCH_DIR"
log "To clean up manually, run: rm -rf $SCRATCH_DIR"
fi
success "NAPLS3 Genomic Data QC Pipeline Completed Successfully"
return 0
}
# Execute the main workflow; the script's exit status is main's return value
# (the previous explicit `exit $?` was redundant — the last command's status
# propagates automatically).
main "$@"