-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEmblask.nf
executable file
·1481 lines (1145 loc) · 84 KB
/
Emblask.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
include { mapLRtoAsm as mapLRtoAsm_1; mapLRtoAsm as mapLRtoAsm_2 } from './module/main.nf'
include { var_CallFilterPhase as var_CallFilterPhase_1; var_CallFilterPhase as var_CallFilterPhase_2 } from './module/main.nf'
include { varCall_hifi as hapResAsmPolish_phase_varCall } from './module/main.nf'
include { varCall_ontR9_trainedModels as mixHapAsm_filter_call; varCall_ontR9_trainedModels as hapResAsmPolish_polishDualAsm_call } from './module/main.nf'
include { varPhase as mixHapAsm_filter_phase; varPhase as hapResAsmPolish_phase_varPhase1; varPhase as hapResAsmPolish_phase_varPhase2 } from './module/main.nf'
include { mapPairedIllumina as mapPairedIllumina_correct; mapPairedIllumina as mapPairedIllumina_polish } from './module/main.nf'
include { mergePairedIlluminaBAM as mergePairedIlluminaBAM_correct; mergePairedIlluminaBAM as mergePairedIlluminaBAM_polish } from './module/main.nf'
/*
Basic quality filtering on corrected ONT reads stored in FASTQ or BAM.
By default, the script detects and removes windows >= 100 bp for which >90% of the bases have QUAL<=5.
Hence N50 of the filtered reads is decreased but quality is increased.
Output reads are compressed.
*/
process filterONT {
label 'small_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path bam_in
path fastq_in
output:
path "lr.${task.process}.fq.gz"
script:
if (bam_in) {
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
filter_fastq_py=\${FILTER_FASTQ_PY:-${params.python.script.filter_fastq}}
MAX_BQ=\$(bc -l <<< \"${params.pipeline.fastq_filter.min_baseq_ratio} * ${params.max_lr_bq}\" | awk '{printf(\"%.0f\", \$0)}')
\${samtools} bam2fq -n -@ ${task.cpus} ${bam_in} | \${pigz} -p ${task.cpus} -c > lr.fq.gz;
python \${filter_fastq_py} -f lr.fq.gz -t ${task.cpus} -c 1000 -l ${params.pipeline.min_len_read} -b \${MAX_BQ} \
-w ${params.pipeline.fastq_filter.len_window_baseq_filter} -r ${params.pipeline.fastq_filter.ratio_low_baseq_window_filter} | \
\${pigz} -p ${task.cpus} -c > lr.${task.process}.fq.gz;
rm -rf reads.fq.gz;
"""
}
else if (fastq_in) {
"""
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
filter_fastq_py=\${FILTER_FASTQ_PY:-${params.python.script.filter_fastq}}
MAX_BQ=\$(bc -l <<< \"${params.pipeline.fastq_filter.min_baseq_ratio} * ${params.max_lr_bq}\" | awk '{printf(\"%.0f\", \$0)}')
python \${filter_fastq_py} -f ${fastq_in} -t ${task.cpus} -c 1000 -l ${params.pipeline.min_len_read} -b \${MAX_BQ} \
-w ${params.pipeline.fastq_filter.len_window_baseq_filter} -r ${params.pipeline.fastq_filter.ratio_low_baseq_window_filter} | \
\${pigz} -p ${task.cpus} -c > lr.${task.process}.fq.gz;
"""
}
else error("No corrected reads in BAM or FASTQ format provided in input")
}
/*
Extract (unpaired) Illumina reads from a BAM file.
Build k=31 de Bruijn graph and Bifrost index from Illumina reads.
Compress output.
*/
process extractIllumina_buildDBG {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple val(trio_member), path(bam_in), path(fastq_in) // trio_member can be 'proband', 'father' or 'mother'
output:
tuple val(trio_member), path("sr.${task.process}.fq.gz"), path("dbg.${task.process}.gfa.gz"), path("dbg.${task.process}.bfi")
script:
if (bam_in) {
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
bifrost=\${BIFROST:-${params.tools.bifrost.bin}}
\${samtools} fastq -@ ${task.cpus} -o sr.${task.process}.fq.gz ${bam_in};
\${bifrost} build -v -k 31 -t ${task.cpus} -s sr.${task.process}.fq.gz -o dbg.${task.process};
"""
}
else if (fastq_in) {
"""
bifrost=\${BIFROST:-${params.tools.bifrost.bin}}
ln -s ${fastq_in} sr.${task.process}.fq.gz
\${bifrost} build -v -k 31 -t ${task.cpus} -s sr.${task.process}.fq.gz -o dbg.${task.process};
"""
}
else error("No corrected reads in BAM or FASTQ format provided in input")
}
/*
Extract interleaved Illumina reads from a BAM file.
Compress output.
*/
process extractPairedIllumina {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path(sr_bam_in)
output:
path "sr.${task.process}.fq.gz"
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
\${samtools} collate -@ ${task.cpus} -u -O -n 128 ${sr_bam_in} /tmp/collate | \${samtools} fastq -o sr.${task.process}.fq.gz -;
"""
}
/*
Split a compressed FASTQ file of Illumina reads into uncompressed chunks of
80M lines or 10M pairs.
*/
process splitPairedIllumina {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path sr_fq
output:
path "sr.${task.process}.fq.part_*"
"""
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
\${pigz} -p ${task.cpus} -d -c ${sr_fq} | split -l 80000000 -d - sr.${task.process}.fq.part_;
"""
}
/*
Create a mixed-haplotype assembly with scaffolds from the proband's corrected long reads
using subsampling if coverage is too high. Change assembly multi-lines FASTA to single-lines
FASTA. Index FASTA and create BED file of contigs. Remove tmp files.
*/
process mixHapAsm {
label 'large_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path filtered_lr_fq
output:
tuple path("asm.${task.process}.fasta"), path("asm.${task.process}.fasta.fai"), path("asm.${task.process}.bed")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
flye=\${FLYE:-${params.tools.flye.bin}}
seqkit=\${SEQKIT:-${params.tools.seqkit.bin}}
FLYE_SUBSAMPLE_PARAM=\"\";
if [ ${params.genome_size} -gt 0 ]; then FLYE_SUBSAMPLE_PARAM=\"--asm-coverage ${params.pipeline.assembly.subsampling_cov} --genome-size ${params.genome_size}\"; fi;
\${flye} --nano-corr ${filtered_lr_fq} -o . -t ${task.cpus} \${FLYE_SUBSAMPLE_PARAM} --read-error ${params.pipeline.assembly.max_error_rate.lenient} --scaffold --extra-params max_bubble_length=300000;
if [ ! \$? -eq 0 ]; then echo \"Flye has exited prematurely\" 1>&2; exit 1; fi;
ASM_COMPLETED=\$({ grep \"INFO: Final assembly\" <(tail -n 50 flye.log) || true; } | wc -l);
if [ \${ASM_COMPLETED} -eq 0 ] || [ ! -s assembly.fasta ]; then echo \"Flye log incomplete or empty assembly\" 1>&2; exit 1; fi;
\${seqkit} seq -w0 assembly.fasta > asm.${task.process}.fasta
\${samtools} faidx asm.${task.process}.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' asm.${task.process}.fasta.fai | sort -k1,1 -k2,2n > asm.${task.process}.bed
rm -rf assembly.fasta 00-assembly 10-consensus 20-repeat 30-contigger 40-polishing;
"""
}
/*
Filter the mixed-haplotype assembly using the haplotagged remapping of proband's corrected long reads to it.
Create BED file of contigs with low coverage (less than 60% of mean assembly coverage).
List low coverage contigs with less than 2 "good quality" heterozygous variants.
List low coverage contigs for which the phase coverage is highly unbalanced (H1 or H2 coverage >= 80% of contig coverage).
Any contig falling in one of the 2 previous categories is flagged as a potential haplotig (haploid contig) and is removed
from the mixed-haplotype assembly.
*/
process mixHapAsm_filter {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path('lr.asm.hap.bam'), path('lr.asm.hap.bam.bai')
tuple path('asm.fasta'), path('asm.fasta.fai'), path('asm.bed')
tuple path('lr.asm.vcf.gz'), path('lr.asm.vcf.gz.tbi')
path asm_cov
output:
tuple path("asm.${task.process}.fasta"), path("asm.${task.process}.fasta.fai"), path("asm.${task.process}.bed")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
bcftools=\${BCFTOOLS:-${params.tools.bcftools.bin}}
seqtk=\${SEQTK:-${params.tools.seqtk.bin}}
get_phase_cov_py=\${GET_PHASE_COV_PY:-${params.python.script.get_phase_cov}}
MEAN_COV=\$(tail -n 1 ${asm_cov} | cut -f3)
head -n -1 ${asm_cov} | awk -v meancov=\$MEAN_COV '{if (\$3<0.6*meancov) {print \$1}}' > lr.asm.low_cov.list
while read CONTIG_NAME
do
awk -v contig=\"\${CONTIG_NAME}\" '{if (\$1==contig) {print \$0}}' asm.bed;
done < lr.asm.low_cov.list > lr.asm.low_cov.bed
\${bcftools} query -i \"(FILTER=='PASS') && ((GT=='AR') & (GQ>=${params.pipeline.variant_filter.min_gq}) & (AD>=${params.pipeline.variant_filter.min_ad}) & \
(VAF>=${params.pipeline.variant_filter.min_vaf}) & (VAF<=${params.pipeline.variant_filter.max_vaf}))\" -f '%CHROM\\n' lr.asm.vcf.gz | sort | uniq -c > pepper.var.count
while read CONTIG_NAME
do
VARIANT_COUNT=\$(awk -v contig=\"\${CONTIG_NAME}\" '{if (\$2==contig) {print \$1}}' pepper.var.count)
if [ -z \${VARIANT_COUNT} ] || [ \${VARIANT_COUNT} -lt 2 ]; then echo -e \"\${CONTIG_NAME}\"; fi
done < lr.asm.low_cov.list > haplotig.low_het_var_count.list
python \${get_phase_cov_py} -t ${task.cpus} -b lr.asm.hap.bam -m ${params.pipeline.min_mapq.strict} -r lr.asm.low_cov.bed > MARGIN_PHASED.haplotagged.cov.list
awk '{COV=\$4+\$5+\$6; COV_THRESHOLD=0.8*COV; if ((\$5>=COV_THRESHOLD) || (\$6>=COV_THRESHOLD)) {print \$1}}' MARGIN_PHASED.haplotagged.cov.list > haplotig.unbalanced_phase.list
cat haplotig.low_het_var_count.list haplotig.unbalanced_phase.list | sort | uniq > haplotigs.list; comm -23 <(awk '{print \$1}' asm.bed | sort -k1,1) haplotigs.list > contig.collapsed.list
\${seqtk} subseq asm.fasta contig.collapsed.list > asm.${task.process}.fasta
\${samtools} faidx asm.${task.process}.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' asm.${task.process}.fasta.fai | sort -k1,1 -k2,2n > asm.${task.process}.bed
"""
}
/*
Prepare local correction:
First, a BED file of high coverage (hc) regions and low-medium coverage (lmc) regions is created. Regions are max 200kb.
Later, each job will process 2Mbp worth of hc-lmc regions so hc and lmc BED files are split accordingly.
This process makes the assumption that less than 10 billion jobs will be used (:D) so all files IDs are encoded using 9 digits
*/
process prepLocalCorrection {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path('lr.bam'), path('lr.bam.bai')
tuple path('sr.bam'), path('sr.bam.bai')
output:
path("split/lr.corrected.?????????.*") // Should include all the lr.corrected.?????????.hc.bed, split/lr.corrected.?????????.lmc.bed, split/lr.corrected.?????????.bed, split/lr.corrected.?????????.hc.dict
"""
bedtools=\${BEDTOOLS:-${params.tools.bedtools.bin}}
local_ratatosk_sh=\${RATATOSK_LOCAL_CORRECTION:-${params.scripts.local_ratatosk}}
mkdir -p split
bash \${local_ratatosk_sh} -t ${task.cpus} -c -l lr.bam -s sr.bam -o lr.corrected
cat <(sed 's/\$/\\t1/' lr.corrected.hc.bed) <(sed 's/\$/\\t2/' lr.corrected.lmc.bed) | sort -k1,1 -k2,2n | \
awk 'BEGIN {SUM=0; ID=0; FS=\"\\t\"; OFS=\"\\t\"} {print \$1,\$2,\$3,\$4,ID,\"lr.corrected.\" ID \".bed\"; SUM+=\$3-\$2; \
if (SUM>=${params.pipeline.correction.job_len_region}) {ID+=1; SUM=0;}}' > lr.corrected.split.bed
cd split
awk '{print>>\$6}' ../lr.corrected.split.bed
ID_MAX=\$(tail -n 1 ../lr.corrected.split.bed | awk '{print \$5}')
rm -rf lr.corrected.split.bed
for i in \$(seq 0 \${ID_MAX})
do
ID_JOB=\$(printf \"%09d\" \$i)
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} {if (\$4==1){print \$1,\$2,\$3}}' lr.corrected.\${i}.bed > lr.corrected.\${ID_JOB}.hc.bed
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} {if (\$4==2){print \$1,\$2,\$3}}' lr.corrected.\${i}.bed > lr.corrected.\${ID_JOB}.lmc.bed
rm -rf lr.corrected.\${i}.bed
\${bedtools} merge -i <(cat lr.corrected.\${ID_JOB}.hc.bed lr.corrected.\${ID_JOB}.lmc.bed | sort -k1,1 -k2,2n) > lr.corrected.\${ID_JOB}.bed
join ../lr.corrected.dict <(cut -f1 lr.corrected.\${ID_JOB}.bed | sort | uniq) | awk '{print \$1 \"\\t\" \$2}' > lr.corrected.\${ID_JOB}.dict
done
"""
}
/*
Performs localized long read correction with short reads using successive calls to Ratatosk with increasing k on segments of 200kb.
Output is compressed.
*/
process localCorrection {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple val(id), path("*")
tuple path('lr.bam'), path('lr.bam.bai')
tuple path('sr.bam'), path('sr.bam.bai')
output:
path "${id}.lr.corr.fastq.gz"
"""
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
local_ratatosk_sh=\${RATATOSK_LOCAL_CORRECTION:-${params.scripts.local_ratatosk}}
if [ -s lr.corrected.${id}.bed ]
then
bash \${local_ratatosk_sh} -t ${task.cpus} -u -Q ${params.max_lr_bq} -l lr.bam -s sr.bam -C lr.corrected.${id} -o ${id}.lr.corr
if [ ! \$? -eq 0 ]; then echo \"Something went wrong during the local correction of a long read chunk\" 1>&2; exit 1; fi
\${pigz} -p ${task.cpus} ${id}.lr.corr.fastq
\${pigz} -p ${task.cpus} -t ${id}.lr.corr.fastq.gz
if [ ! \$? -eq 0 ]; then echo \"File ${id}.lr.corr.fastq.gz is malformed\" 1>&2; exit 1; fi
fi
"""
}
/*
Merge locally corrected long reads into one file with no duplicates (duplicate should not occur but better be safe than sorry).
Add unampped reads (left globally-corrected but not locally-corrected) to the file.
Compress output.
*/
process mergeLocalCorrection {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path lr_chunks_fq_gz
tuple path('lr.asm.bam'), path('lr.asm.bam.bai')
output:
path "lr.${task.process}.fastq.gz"
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
seqkit=\${SEQKIT:-${params.tools.seqkit.bin}}
cat *.fastq.gz > lr.corrected.fastq.gz
\${seqkit} rmdup -j ${task.cpus} -n -o lr.corrected.nodup.fastq lr.corrected.fastq.gz
rm -rf lr.corrected.fastq.gz
\${pigz} -p ${task.cpus} lr.corrected.nodup.fastq
mv -f lr.corrected.nodup.fastq.gz lr.${task.process}.fastq.gz
\${samtools} view -@ ${task.cpus} -b -f 4 lr.asm.bam | \${samtools} bam2fq -n -@ ${task.cpus} - 2> /dev/null | \${pigz} -p ${task.cpus} -c >> lr.${task.process}.fastq.gz
\${pigz} -p ${task.cpus} -t lr.${task.process}.fastq.gz
if [ ! \$? -eq 0 ]; then echo \"File lr.${task.process}.fastq.gz is malformed\" 1>&2; exit 1; fi
"""
}
/*
Create a haplotype-resolved assembly with scaffolds from the proband's globally & locally corrected
long reads using subsampling if coverage is too high. Change assembly multi-lines FASTA to single-lines
FASTA. Index FASTA and create BED file of contigs. Remove tmp files.
*/
process hapResAsm {
label 'large_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path filtered_lr_fq_gz
output:
tuple path("asm.${task.process}.fasta"), path("asm.${task.process}.fasta.fai"), path("asm.${task.process}.bed")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
flye=\${FLYE:-${params.tools.flye.bin}}
seqkit=\${SEQKIT:-${params.tools.seqkit.bin}}
FLYE_SUBSAMPLE_PARAM=\"\"
if [ ${params.genome_size} -gt 0 ]; then FLYE_SUBSAMPLE_PARAM=\"--asm-coverage ${params.pipeline.assembly.subsampling_cov} --genome-size ${params.genome_size}\"; fi
\${flye} --nano-hq ${filtered_lr_fq_gz} -o . -t ${task.cpus} --keep-haplotypes --read-error ${params.pipeline.assembly.max_error_rate.strict} --scaffold \${FLYE_SUBSAMPLE_PARAM}
if [ ! \$? -eq 0 ]; then echo \"Flye has exited prematurely\" 1>&2; exit 1; fi
ASM_COMPLETED=\$( { grep \"INFO: Final assembly\" <(tail -n 50 flye.log) || true; } | wc -l)
if [ \${ASM_COMPLETED} -eq 0 ] || [ ! -s assembly.fasta ]; then echo \"Flye log incomplete or empty assembly\" 1>&2; exit 1; fi
\${seqkit} seq -w0 assembly.fasta > asm.${task.process}.fasta;
\${samtools} faidx asm.${task.process}.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' asm.${task.process}.fasta.fai | sort -k1,1 -k2,2n > asm.${task.process}.bed
rm -rf assembly.fasta 00-assembly 10-consensus 20-repeat 30-contigger 40-polishing
"""
}
/*
Filter out haplotigs from the haplotype-resolved assembly which are either too short or too low coverage
*/
process hapResAsm_filter {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path lr_fq_gz
tuple path(asm_fa), path(asm_fai), path(asm_bed)
output:
tuple path("asm.${task.process}.fasta"), path("asm.${task.process}.fasta.fai"), path("asm.${task.process}.bed")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
minimap2=\${MINIMAP2:-${params.tools.minimap2.bin}}
seqtk=\${SEQTK:-${params.tools.seqtk.bin}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1)
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}')
\${minimap2} -t ${task.cpus} ${params.tools.minimap2.param.ont_reads} -Y -I \$(du -L -BG ${asm_fa} | cut -f1) ${asm_fa} ${lr_fq_gz} > lr.asm.sam
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M lr.asm.sam > lr.asm.bam
if [ ! -s lr.asm.bam ]; then echo \"File lr.asm.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck lr.asm.bam
if [ ! \$? -eq 0 ]; then echo \"File lr.asm.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} lr.asm.bam; rm -rf lr.asm.sam
\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq_strict} -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.bam | \
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; POS_S=-1; POS_E=-1} {if (\$3>=${params.pipeline.variant_filter.min_depth}) {if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} else \
{ if ((POS_S!=-1) && (POS_E-POS_S>=${params.pipeline.min_len_contig})) {print CONTIG, (POS_S-1), (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1;}}} \
END {if ((POS_S!=-1) && (POS_E-POS_S>=${params.pipeline.min_len_contig})) {print CONTIG, (POS_S-1), (POS_E-1)}}' > lr.asm.filtered.bed
\${seqtk} subseq ${asm_fa} lr.asm.filtered.bed | tr ':' '_' > asm.${task.process}.fasta
\${samtools} faidx asm.${task.process}.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' asm.${task.process}.fasta.fai | sort -k1,1 -k2,2n > asm.${task.process}.bed
"""
}
/*
First polishing applied to filtered haplotype-resolved assembly of globally & locally corrected long reads (gllr)
First, gllr are mapped back to the assembly. A script filter the alignments, removing alignments with low MAPQ,
high error rates or short alignment length if mapping quality is not 60.
Using the filtered BAM file, coverage from primary alignments (P) and then primary+supplementary alignments (PS) is
computed per haplotig. A given haplotig is removed if PS-P>=2*P.
Using the mean read coverage COV on the mixed-haplotype assembly, we define a first set a lenient coverage boundaries
[0.5*(COV/2), 1.5*COV] and remove haplotigs outside of this coverage range.
Then using a coverage range of [0.75*COV, 1.25*COV] on the remaining haplotigs, we define a list of haplotigs which are
potentially collapsed (only one of the 2 haplotypes exist in the assembly or haplotig is homozygous for locus).
*/
process hapResAsm_polish1_1 {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path lr_fq_gz
tuple path(asm_fa), path(asm_fai), path(asm_bed)
path lr_cov
output:
tuple path("lr.asm.${task.process}.bam"), path("lr.asm.${task.process}.bam.bai"), emit: bam
path "lr.asm.${task.process}.collapsed.bed", emit: bed
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
minimap2=\${MINIMAP2:-${params.tools.minimap2.bin}}
filter_align_py=\${FILTER_ALIGN_PY:-${params.python.script.filter_align}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1)
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}')
\${minimap2} -t ${task.cpus} -ax map-hifi -Y -I \$(du -L -BG ${asm_fa} | cut -f1) ${asm_fa} ${lr_fq_gz} > lr.asm.sam
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M lr.asm.sam > lr.asm.bam
if [ ! -s lr.asm.bam ]; then echo \"File lr.asm.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck lr.asm.bam
if [ ! \$? -eq 0 ]; then echo \"File lr.asm.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} lr.asm.bam
rm -rf lr.asm.sam
python \${filter_align_py} -t ${task.cpus} -i lr.asm.bam -o lr.asm.tmp.bam -l ${params.pipeline.min_len_alignment} \
-e ${params.pipeline.assembly.max_error_rate.lenient} -m ${params.pipeline.min_mapq.strict}
mv -f lr.asm.tmp.bam lr.asm.bam
\${samtools} index -@ ${task.cpus} lr.asm.bam
\${samtools} depth -@ ${task.cpus} -aa -J -G ${params.pipeline.samtools.depth.filter_supp_sec} lr.asm.bam | \
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; SUM=0; COUNT=0;} \
{if (\$1!=CONTIG) {if (CONTIG!=\"\") {MEAN=0; if (COUNT>=1) {MEAN=SUM/COUNT}; print CONTIG, MEAN}; CONTIG=\$1; SUM=0; COUNT=0}; SUM+=\$3; COUNT+=1} \
END {if (CONTIG!=\"\") {MEAN=0; if (COUNT>=1) {MEAN=SUM/COUNT}; print CONTIG, MEAN}}' | sort -k1,1 > lr.asm.prim.cov
\${samtools} depth -@ ${task.cpus} -aa -J -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.bam | \
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; SUM=0; COUNT=0;} \
{if (\$1!=CONTIG) {if (CONTIG!=\"\") {MEAN=0; if (COUNT>=1) {MEAN=SUM/COUNT}; print CONTIG, MEAN}; CONTIG=\$1; SUM=0; COUNT=0}; SUM+=\$3; COUNT+=1} \
END {if (CONTIG!=\"\") {MEAN=0; if (COUNT>=1) {MEAN=SUM/COUNT}; print CONTIG, MEAN}}' | sort -k1,1 > lr.asm.primsup.cov
join lr.asm.prim.cov lr.asm.primsup.cov | awk '{if ((\$3-\$2)<(2*\$2)){print \$1}}' > lr.asm.keep.1.list
join lr.asm.keep.1.list <(sort -k1,1 ${asm_bed}) | sort -k1,1 -k2,2n > lr.asm.keep.1.bed
\${samtools} view -@ ${task.cpus} -b -M -L lr.asm.keep.1.bed lr.asm.bam > lr.asm.filtered.bam
\${samtools} index -@ ${task.cpus} lr.asm.filtered.bam
COV=\$(tail -n 1 ${lr_cov} | awk '{printf(\"%.0f\\n\", \$3)}')
COLLAPSED_COV_MIN=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.lower_bound.strict}*\$0)}')
COLLAPSED_COV_MAX=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.upper_bound.strict}*\$0)}')
SUPER_COV_MIN=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.lower_bound.lenient}*(\$0/2))}')
SUPER_COV_MAX=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.upper_bound.lenient}*\$0)}')
\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.filtered.bam | \
awk -v sup_min_cov=\${SUPER_COV_MIN} -v sup_max_cov=\${SUPER_COV_MAX} 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; POS_S=-1; POS_E=-1} \
{if ((\$3>=sup_min_cov) && (\$3<=sup_max_cov)) {if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} else { if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1;}}} \
END {if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}}' > lr.asm.keep.2.bed
\${samtools} view -@ ${task.cpus} -b -M -L lr.asm.keep.2.bed lr.asm.filtered.bam > lr.asm.${task.process}.bam
\${samtools} index -@ ${task.cpus} lr.asm.${task.process}.bam
\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.${task.process}.bam | \
awk -v min_cov=\${COLLAPSED_COV_MIN} -v max_cov=\${COLLAPSED_COV_MAX} 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; POS_S=-1; POS_E=-1} \
{if ((\$3>=min_cov) && (\$3<=max_cov)) {if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} else { if (POS_S!=-1) {print CONTIG, (POS_S-1) , (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1;}}} \
END {if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}}' > lr.asm.${task.process}.collapsed.bed
"""
}
/*
Split the haplotigs where the phasing is inconsistent using the MARGIN phasing output.
Remap gllr to split haplotigs.
A script filter the alignments, removing alignments with low MAPQ, high error rates or short alignment length if mapping quality is not 60.
Then using a coverage range of [0.75*COV, 1.25*COV] on the remaining haplotigs, we define a list of haplotigs which are
potentially collapsed (for a given locus, only one of the 2 haplotypes exist in the assembly or haplotig is homozygous).
*/
process hapResAsm_polish1_2 {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path(asm_fa), path(asm_fai), path(asm_bed)
tuple path('lr.asm.bam'), path('lr.asm.bam.bai')
path 'MARGIN_PHASED.phaseset.bed'
path lr_cov
output:
tuple path("assembly.${task.process}.splitPS.fasta"), path("assembly.${task.process}.splitPS.fasta.fai"), path("assembly.${task.process}.splitPS.bed"), emit: asm
tuple path("lr.asm.${task.process}.splitPS.bam"), path("lr.asm.${task.process}.splitPS.bam.bai"), emit: bam
path "lr.asm.${task.process}.splitPS.collapsed.bed", emit: bed
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
minimap2=\${MINIMAP2:-${params.tools.minimap2.bin}}
seqtk=\${SEQTK:-${params.tools.seqtk.bin}}
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
filter_align_py=\${FILTER_ALIGN_PY:-${params.python.script.filter_align}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1)
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}')
COV=\$(tail -n 1 ${lr_cov} | awk '{printf(\"%.0f\\n\", \$3)}')
COLLAPSED_COV_MIN=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.lower_bound.strict}*\$0)}')
COLLAPSED_COV_MAX=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.upper_bound.strict}*\$0)}')
sort -k1,1 -k2,2n MARGIN_PHASED.phaseset.bed > MARGIN_PHASED.phaseset.tmp.bed
mv -f MARGIN_PHASED.phaseset.tmp.bed MARGIN_PHASED.phaseset.bed
awk '{if (\$4!=\"ContigEnd\"){print \$1}}' MARGIN_PHASED.phaseset.bed | cut -f1 | sort | uniq > MARGIN_PHASED.phaseset.inconsistent.list
join MARGIN_PHASED.phaseset.bed MARGIN_PHASED.phaseset.inconsistent.list | cut -d ' ' -f1,2,3 | tr ' ' '\\t' > MARGIN_PHASED.phaseset.inconsistent.bed
comm -13 MARGIN_PHASED.phaseset.inconsistent.list <(awk '{print \$1}' ${asm_bed} | sort -k1,1) > MARGIN_PHASED.phaseset.consistent.list
join MARGIN_PHASED.phaseset.inconsistent.bed <(sort -k1,1 ${asm_bed}) | cut -d ' ' -f1,2,3,4,5 | tr ' ' '\\t' | \
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CHROM=\"\"; SPOS=0; EPOS=0; EPOS_CONTIG=0} \
{if (\$1!=CHROM){if (CHROM!=\"\"){print CHROM, SPOS, EPOS_CONTIG}; CHROM=\$1; SPOS=0; EPOS=\$3; EPOS_CONTIG=\$5} else \
{EPOS+=int((\$2-EPOS)/2); print CHROM, SPOS, EPOS; CHROM=\$1; SPOS=EPOS; EPOS=\$3; EPOS_CONTIG=\$5}} \
END {if (CHROM!=\"\"){print CHROM, SPOS, EPOS_CONTIG}}' > MARGIN_PHASED.phaseset.inconsistent.split.bed
cat <(\${seqtk} subseq ${asm_fa} MARGIN_PHASED.phaseset.consistent.list) <(\${seqtk} subseq ${asm_fa} MARGIN_PHASED.phaseset.inconsistent.split.bed) | \
awk '{if (NR%2==1){print \">contig_\" ID; ID+=1} else {print \$0}}' > assembly.${task.process}.splitPS.fasta
\${samtools} faidx assembly.${task.process}.splitPS.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' assembly.${task.process}.splitPS.fasta.fai | sort -k1,1 -k2,2n > assembly.${task.process}.splitPS.bed
\${samtools} bam2fq -@ ${task.cpus} -n lr.asm.bam | \${pigz} -p ${task.cpus} -c > lr.asm.fq.gz
\${minimap2} -t ${task.cpus} -ax map-hifi -Y -I \$(du -L -BG assembly.${task.process}.splitPS.fasta | cut -f1) assembly.${task.process}.splitPS.fasta lr.asm.fq.gz > lr.asm.${task.process}.splitPS.sam
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M lr.asm.${task.process}.splitPS.sam > lr.asm.${task.process}.splitPS.bam
if [ ! -s lr.asm.${task.process}.splitPS.bam ]; then echo \"File lr.asm.${task.process}.splitPS.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck lr.asm.${task.process}.splitPS.bam
if [ ! \$? -eq 0 ]; then echo \"File lr.asm.${task.process}.splitPS.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} lr.asm.${task.process}.splitPS.bam
rm -rf lr.asm.${task.process}.splitPS.sam lr.asm.fq.gz
python \${filter_align_py} -t ${task.cpus} -i lr.asm.${task.process}.splitPS.bam -o lr.asm.${task.process}.splitPS.tmp.bam -l ${params.pipeline.min_len_alignment} \
-e ${params.pipeline.assembly.max_error_rate.lenient} -m ${params.pipeline.min_mapq.strict}
mv -f lr.asm.${task.process}.splitPS.tmp.bam lr.asm.${task.process}.splitPS.bam
\${samtools} index -@ ${task.cpus} lr.asm.${task.process}.splitPS.bam
\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.${task.process}.splitPS.bam | \
awk -v min_cov=\${COLLAPSED_COV_MIN} -v max_cov=\${COLLAPSED_COV_MAX} 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; POS_S=-1; POS_E=-1} \
{if ((\$3>=min_cov) && (\$3<=max_cov)) {if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} else { if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1;}}} \
END {if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}}' > lr.asm.${task.process}.splitPS.collapsed.bed
"""
}
/*
Using a haplotagged BAM file of gllr mapping to the split haplotigs, extract a BAM of only the reads mapping
to the ref (hap-resolved assembly) allele.
For each phaseset coordinates, extract how many variants have the alt allele on H1 and H2. If a phaseset has
>25% of the alt on one haplotype and the rest on the other haplotype, the phaseset is suspicious and is flagged
as "corrupted".
Corrupted and uncorrupted regions are polished separately with the ref allele reads. This can create duplicated haplotigs.
Gllr are remapped to the polished haplotigs which are then filtered on min coverage and length.
To solve the duplicate issue, the polished haplotigs are mapped to themselves: if an haplotig maps to another haplotig with
MAPQ 0 and a gap-compressed per-base sequence divergence < 0.1%, haplotig is considered to be a duplicate and is removed.
*/
process hapResAsm_polish2 {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path(asm_fa), path(asm_fai), path(asm_bed)
tuple path('hap_lr_splitPS.bam'), path('hap_lr_splitPS.bam.bai')
tuple path('hap_lr_splitPS.vcf.gz'), path('hap_lr_splitPS.vcf.gz.tbi')
output:
tuple path("detectCorruptPS/ref-to-ref-map/asm.polished.${task.process}.fasta"), path("detectCorruptPS/ref-to-ref-map/asm.polished.${task.process}.fasta.fai"), path("detectCorruptPS/ref-to-ref-map/asm.polished.${task.process}.bed")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
minimap2=\${MINIMAP2:-${params.tools.minimap2.bin}}
flye=\${FLYE:-${params.tools.flye.bin}}
seqkit=\${SEQKIT:-${params.tools.seqkit.bin}}
seqtk=\${SEQTK:-${params.tools.seqtk.bin}}
bedtools=\${BEDTOOLS:-${params.tools.bedtools.bin}}
pigz=\${PIGZ_BIN:-${params.tools.pigz.bin}}
extract_ref_reads_py=\${EXTRACT_REF_READS_PY:-${params.python.script.extract_ref_reads}}
get_ps_info_py=\${GET_PS_INFO_PY:-${params.python.script.get_ps_info}}
extract_reads_within_py=\${EXTRACT_READS_WITHIN_PY:-${params.python.script.extract_reads_within}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1)
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}')
python \${extract_ref_reads_py} -t ${task.cpus} -m ${params.pipeline.min_mapq.strict} -b hap_lr_splitPS.bam \
-v hap_lr_splitPS.vcf.gz -B MARGIN_PHASED.haplotagged.polish.bam --include_supplementary
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M MARGIN_PHASED.haplotagged.polish.bam > MARGIN_PHASED.haplotagged.polish.sorted.bam
mv -f MARGIN_PHASED.haplotagged.polish.sorted.bam MARGIN_PHASED.haplotagged.polish.bam
if [ ! -s MARGIN_PHASED.haplotagged.polish.bam ]; then echo \"File MARGIN_PHASED.haplotagged.polish.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck MARGIN_PHASED.haplotagged.polish.bam
if [ ! \$? -eq 0 ]; then echo \"File MARGIN_PHASED.haplotagged.polish.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} MARGIN_PHASED.haplotagged.polish.bam
mkdir -p detectCorruptPS
cd detectCorruptPS
python \${get_ps_info_py} -v ../hap_lr_splitPS.vcf.gz | \
awk '{SUM=\$5+\$6; if (((\$5>\$6) && (\$5/SUM<0.75)) || ((\$6>=\$5) && (\$6/SUM<0.75))) {print \$1 \"\\t\" \$2 \"\\t\" \$3}}' > MARGIN_PHASED.corrupt_ps.bed
\${bedtools} subtract -a ../${asm_bed} -b MARGIN_PHASED.corrupt_ps.bed | sort -k1,1 -k2,2n > MARGIN_PHASED.no_corrupt_ps.bed
if [ -s MARGIN_PHASED.corrupt_ps.bed ]
then
\${samtools} view -@ ${task.cpus} -b -M -L MARGIN_PHASED.corrupt_ps.bed ../MARGIN_PHASED.haplotagged.polish.bam > MARGIN_PHASED.haplotagged.polish.corruptPS.bam
\${samtools} index -@ ${task.cpus} MARGIN_PHASED.haplotagged.polish.corruptPS.bam
python \${extract_reads_within_py} -t ${task.cpus} -b ../MARGIN_PHASED.haplotagged.polish.bam -r MARGIN_PHASED.no_corrupt_ps.bed -B MARGIN_PHASED.haplotagged.polish.noCorruptPS.bam -s -S
\${samtools} index -@ ${task.cpus} MARGIN_PHASED.haplotagged.polish.noCorruptPS.bam
\${flye} --polish-target ../${asm_fa} --nano-hq MARGIN_PHASED.haplotagged.polish.corruptPS.bam -t ${task.cpus} -o flye.polish.corruptPS
\${seqkit} seq -w0 -m${params.pipeline.min_len_collapsed_segment} flye.polish.corruptPS/polished_1.fasta > flye.polish.corruptPS/polished_1.tmp.fasta
mv -f flye.polish.corruptPS/polished_1.tmp.fasta flye.polish.corruptPS/polished_1.fasta
\${flye} --polish-target ../${asm_fa} --nano-hq MARGIN_PHASED.haplotagged.polish.noCorruptPS.bam -t ${task.cpus} -o flye.polish.noCorruptPS
\${seqkit} seq -w0 -m${params.pipeline.min_len_collapsed_segment} flye.polish.noCorruptPS/polished_1.fasta > flye.polish.noCorruptPS/polished_1.tmp.fasta
mv -f flye.polish.noCorruptPS/polished_1.tmp.fasta flye.polish.noCorruptPS/polished_1.fasta
cat flye.polish.corruptPS/polished_1.fasta flye.polish.noCorruptPS/polished_1.fasta | awk 'BEGIN {ID=0} {if (substr(\$0,1,1)==\">\") {print \">contig_\" ID; ID+=1} else {print \$0}}' > asm.polished.fasta
else
\${flye} --polish-target ../${asm_fa} --nano-hq ../MARGIN_PHASED.haplotagged.polish.bam -t ${task.cpus} -o flye.polish.noCorruptPS
\${seqkit} seq -w0 -m${params.pipeline.min_len_collapsed_segment} flye.polish.noCorruptPS/polished_1.fasta > flye.polish.noCorruptPS/polished_1.tmp.fasta
mv -f flye.polish.noCorruptPS/polished_1.tmp.fasta flye.polish.noCorruptPS/polished_1.fasta
awk 'BEGIN {ID=0} {if (substr(\$0,1,1)==\">\") {print \">contig_\" ID; ID+=1} else {print \$0}}' flye.polish.noCorruptPS/polished_1.fasta > asm.polished.fasta
fi
\${samtools} faidx asm.polished.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' asm.polished.fasta.fai | sort -k1,1 -k2,2n > asm.polished.bed
\${samtools} bam2fq -@ ${task.cpus} -n ../MARGIN_PHASED.haplotagged.polish.bam | \${pigz} -p ${task.cpus} -c > MARGIN_PHASED.haplotagged.polish.fastq.gz
\${minimap2} -t ${task.cpus} -ax map-hifi -Y -I \$(du -L -BG asm.polished.fasta | cut -f1) asm.polished.fasta MARGIN_PHASED.haplotagged.polish.fastq.gz > lr.asm.sam
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M lr.asm.sam > lr.asm.bam
if [ ! -s lr.asm.bam ]; then echo \"File lr.asm.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck lr.asm.bam
if [ ! \$? -eq 0 ]; then echo \"File lr.asm.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} lr.asm.bam
rm -rf lr.asm.sam MARGIN_PHASED.haplotagged.polish.fastq.gz
\${samtools} depth -@ ${task.cpus} -aa -J -Q 0 -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.bam | \
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; POS_S=-1; POS_E=-1} \
{if (\$3<${params.pipeline.variant_filter.min_depth}) {if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} else \
{ if ((POS_S!=-1) && (POS_E-POS_S>=${params.pipeline.min_len_collapsed_segment})) {print CONTIG, (POS_S-1), (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1;}}} \
END {if ((POS_S!=-1) && (POS_E-POS_S>=${params.pipeline.min_len_collapsed_segment})) {print CONTIG, (POS_S-1), (POS_E-1)}}' > lr.asm.discard.bed
\${bedtools} subtract -a asm.polished.bed -b lr.asm.discard.bed | sort -k1,1 -k2,2n > lr.asm.keep.bed
\${seqtk} subseq asm.polished.fasta lr.asm.keep.bed | tr ':' '_' > asm.polished.filtered.fasta
mkdir ref-to-ref-map; cd ref-to-ref-map
\${minimap2} -t ${task.cpus} ${params.tools.minimap2.param.asm} -I \$(du -L -BG ../asm.polished.filtered.fasta | cut -f1) ../asm.polished.filtered.fasta ../asm.polished.filtered.fasta > ref2ref.sam
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M ref2ref.sam > ref2ref.bam
if [ ! -s ref2ref.bam ]; then echo \"File ref2ref.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck ref2ref.bam
if [ ! \$? -eq 0 ]; then echo \"File ref2ref.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} ref2ref.bam; rm -rf ref2ref.sam
\${samtools} view -@ ${task.cpus} -F 2048 --tag de ref2ref.bam | awk '{if ((\$5==0) && (\$1!=\$3)) {for (i=6; i<=NF; i+=1){ if (substr(\$i,1,5)==\"de:f:\") \
{ERR=substr(\$i,6,length(\$i)-5); if (ERR<=0.001) {print \$1}; break;}}}}' | sort | uniq > duplicates.list
awk '{if (substr(\$0,1,1)==\">\"){print substr(\$0,2,length(\$0)-1)}}' ../asm.polished.filtered.fasta | sort > contigs.all.list
comm -23 contigs.all.list duplicates.list > contigs.keep.list
\${seqtk} subseq ../asm.polished.filtered.fasta contigs.keep.list > asm.polished.fasta
\${seqtk} seq -L 3000 asm.polished.fasta > asm.polished.${task.process}.fasta
\${samtools} faidx asm.polished.${task.process}.fasta
awk '{print \$1 \"\\t0\\t\" \$2}' asm.polished.${task.process}.fasta.fai | sort -k1,1 -k2,2n > asm.polished.${task.process}.bed
"""
}
/*
A script filter the alignments, removing alignments with low MAPQ, high error rates or short alignment length if mapping quality is not 60.
*/
process hapResAsmPolish_phase_mapFilter {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
path lr_fq_gz
tuple path(asm_fa), path(asm_fai), path(asm_bed)
output:
tuple path("lr.asm.${task.process}.bam"), path("lr.asm.${task.process}.bam.bai")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
minimap2=\${MINIMAP2:-${params.tools.minimap2.bin}}
filter_align_py=\${FILTER_ALIGN_PY:-${params.python.script.filter_align}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1)
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}')
\${minimap2} -t ${task.cpus} -ax map-hifi -Y -I \$(du -L -BG ${asm_fa} | cut -f1) ${asm_fa} ${lr_fq_gz} > lr.asm.sam
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M lr.asm.sam > lr.asm.bam
if [ ! -s lr.asm.bam ]; then echo \"File lr.asm.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck lr.asm.bam
if [ ! \$? -eq 0 ]; then echo \"File lr.asm.bam is malformed\" 1>&2; exit 1; fi
\${samtools} index -@ ${task.cpus} lr.asm.bam
rm -rf lr.asm.sam
python \${filter_align_py} -t ${task.cpus} -i lr.asm.bam -o lr.asm.${task.process}.bam \
-l ${params.pipeline.min_len_alignment} -e ${params.pipeline.assembly.max_error_rate.lenient} -m ${params.pipeline.min_mapq.strict}
\${samtools} index -@ ${task.cpus} lr.asm.${task.process}.bam
rm -rf lr.asm.bam
"""
}
/*
Compute a BED file of high quality heterozygous SNP for each haplotig (if found, indicates the other haplotig for that locus is missing)
Compute the mean read coverage COV2 at those sites if site coverage falls within [0.75*COV, 1.25*COV] to avoid counting very high coverage
of haplotigs for which only one copy of large repeated region is present (or all copies are collapsed in one).
Create new list of regions where read coverage falls within [0.75*COV2, 1.25*COV2]: those are regions where reads of both haplotypes map
Extract high quality SNPs from those regions.
*/
process hapResAsmPolish_phase_varFilter {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path('lr.asm.bam'), path('lr.asm.bam.bai')
tuple path('lr.asm.vcf.gz'), path('lr.asm.vcf.gz.tbi')
path lr_cov
output:
tuple path("lr.asm.${task.process}.vcf.gz"), path("lr.asm.${task.process}.vcf.gz.tbi")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
bcftools=\${BCFTOOLS:-${params.tools.bcftools.bin}}
\${bcftools} query -i \"(type=='snp') && (FILTER=='PASS') && ((GT=='AR') & (GQ>=${params.pipeline.variant_filter.min_gq}) & (AD>=${params.pipeline.variant_filter.min_ad}) & \
(VAF>=${params.pipeline.variant_filter.min_vaf}) & (VAF<=${params.pipeline.variant_filter.max_vaf}))\" -f '[%CHROM\\t%POS0\\t%END\n]' lr.asm.vcf.gz > lr.asm.hq.bed;
COLLAPSED_COV_MIN=\$(tail -n 1 ${lr_cov} | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.lower_bound.strict}*\$3)}');
COLLAPSED_COV_MAX=\$(tail -n 1 ${lr_cov} | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.upper_bound.strict}*\$3)}');
COV=\$(\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} -b lr.asm.hq.bed lr.asm.bam | \
awk -v min_cov=\${COLLAPSED_COV_MIN} -v max_cov=\${COLLAPSED_COV_MAX} '{if ((\$3>=min_cov) && (\$3<=max_cov)) {SUM+=\$3; COUNT+=1}} END {if (COUNT==0){print \"0\"} else {printf(\"%.0f\\n\", SUM/COUNT)}}')
COLLAPSED_COV_MIN=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.lower_bound.strict}*\$0)}');
COLLAPSED_COV_MAX=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", ${params.pipeline.ratio_cov.upper_bound.strict}*\$0)}');
\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.bam | \
awk -v min_cov=\${COLLAPSED_COV_MIN} -v max_cov=\${COLLAPSED_COV_MAX} 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"; CONTIG=\"\"; POS_S=-1; POS_E=-1} \
{if ((\$3>=min_cov) && (\$3<=max_cov)) {if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} \
else { if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1;}}} \
END {if (POS_S!=-1) {print CONTIG, (POS_S-1), (POS_E-1)}}' > lr.asm.collapsed.bed;
\${bcftools} view -i \"(type=='snp') && (FILTER=='PASS') && ((GT=='AR') & (GQ>=${params.pipeline.variant_filter.min_gq}) & (AD>=${params.pipeline.variant_filter.min_ad}) & \
(VAF>=${params.pipeline.variant_filter.min_vaf}) & (VAF<=${params.pipeline.variant_filter.max_vaf}))\" -R lr.asm.collapsed.bed -Oz lr.asm.vcf.gz | \
\${bcftools} sort -Oz -o lr.asm.${task.process}.vcf.gz
tabix -p vcf lr.asm.${task.process}.vcf.gz
"""
}
/*
Compute BED file of SNPs which are phased with their overlapping read coverage.
Compute mean coverage of these SNPs and coverage deviation around this mean coverage. These are global information.
Compute mean coverage of these SNPs and coverage deviation around this mean coverage per haplotig. These are local information.
For haplotigs with phased variants, compute regions where read coverage falls within local([STD-MEAN,STD+MEAN]).
For haplotigs with no phased variants, compute regions where read coverage falls within global([STD-MEAN,STD+MEAN]).
Extract SNPs overlapping these regions.
*/
process hapResAsmPolish_phase_varPhasedFilter {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path('lr.asm.unfilt.vcf.gz'), path('lr.asm.unfilt.vcf.gz.tbi')
tuple path('lr.asm.hap.bam'), path('lr.asm.hap.bam.bai')
tuple path('lr.asm.hap.filt.vcf.gz'), path('lr.asm.hap.filt.vcf.gz.tbi')
output:
tuple path("lr.asm.${task.process}.vcf.gz"), path("lr.asm.${task.process}.vcf.gz.tbi")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
bcftools=\${BCFTOOLS:-${params.tools.bcftools.bin}}
\${bcftools} query -i \"(type=='snp') & (PS>=0)\" -f '%CHROM\\t%POS0\\t%END\n' lr.asm.hap.filt.vcf.gz > lr.asm.hap.filt.bed
\${samtools} depth -@ ${task.cpus} -a -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} -b lr.asm.hap.filt.bed lr.asm.hap.bam | \
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} {print \$1, \$2, (\$2+1), \$3}' > lr.asm.hap.filt.bed.tmp
mv -f lr.asm.hap.filt.bed.tmp lr.asm.hap.filt.bed
COV_STDEV=\$(awk 'BEGIN {i=1} {SUM+=\$4; COUNT+=1; SAMP[i]=\$4; i+=1} END {MEAN=SUM/COUNT; SUMSQ=0; for (i=1; i<=COUNT; i++) {SUMSQ+=(SAMP[i]-MEAN)^2}; print MEAN \"\\t\" sqrt(SUMSQ/COUNT)}' lr.asm.hap.filt.bed)
COV=\$(echo -e \"\${COV_STDEV}\" | awk '{print \$1}')
AAD_COV=\$(echo -e \"\${COV_STDEV}\" | awk '{print \$2}')
awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} {if (\$1!=CONTIG){if (CONTIG!=\"\") {MEAN=SUM/COUNT; SUMSQ=0; for (i=1; i<=COUNT; i++) {SUMSQ+=(SAMP[i]-MEAN)^2}; \
print CONTIG, MEAN, sqrt(SUMSQ/COUNT)}; CONTIG=\$1; SUM=\$4; COUNT=1; i=1} else {SUM+=\$4; COUNT+=1}; SAMP[i]=\$4; i+=1} \
END {if (CONTIG!=\"\") {MEAN=SUM/COUNT; SUMSQ=0; for (i=1; i<=COUNT; i++) {SUMSQ+=(SAMP[i]-MEAN)^2}; print CONTIG, MEAN, sqrt(SUMSQ/COUNT)};}' lr.asm.hap.filt.bed > lr.asm.hap.filt.cov
cat lr.asm.hap.filt.cov <(\${samtools} depth -@ ${task.cpus} -J -Q ${params.pipeline.min_mapq.strict} -G ${params.pipeline.samtools.depth.filter_sec} lr.asm.hap.bam) | \
awk -v COV_GLOB=\$(echo -e \"\${COV}\" | awk '{printf(\"%.0f\\n\", \$0)}') -v AAD_GLOB=\$(echo -e \"\${AAD_COV}\" | awk '{printf(\"%.0f\\n\", \$0)}') -v NB_PHASED_HAP=\$(wc -l lr.asm.hap.filt.cov | cut -d ' ' -f1) \
'BEGIN {CONTIG=\"\"; POS_S=-1; POS_E=-1; COV_GLOB_SUP_LOW=(COV_GLOB-AAD_GLOB)*${params.pipeline.ratio_cov.lower_bound.lenient}; COV_GLOB_SUP_UP=(COV_GLOB+AAD_GLOB)*${params.pipeline.ratio_cov.upper_bound.lenient}} \
{if (NR<=NB_PHASED_HAP){COV_LOC[\$1]=\$2; AAD_LOC[\$1]=\$3} else {COV=COV_GLOB; AAD=AAD_GLOB; if (\$1 in COV_LOC){COV=COV_LOC[\$1]; AAD=AAD_LOC[\$1]}; \
if ((\$3>=(COV-AAD)*${params.pipeline.ratio_cov.lower_bound.strict}) && (\$3<=(COV+AAD)*${params.pipeline.ratio_cov.upper_bound.strict}) && (\$3>=COV_GLOB_SUP_LOW) && (\$3<=COV_GLOB_SUP_UP)) \
{if ((\$1==CONTIG) && (\$2==POS_E)) {POS_E+=1} else { if (POS_S!=-1) {print CONTIG \"\\t\" (POS_S-1) \"\\t\" (POS_E-1)}; CONTIG=\$1; POS_S=\$2; POS_E=\$2+1}}}} \
END {if (POS_S!=-1) {print CONTIG \"\\t\" (POS_S-1) \"\\t\" (POS_E-1)}}' > lr.asm.hap.filt.cov.bed
\${bcftools} view -i \"(type=='snp') && (FILTER=='PASS') && ((GT=='AR') & (GQ>=${params.pipeline.variant_filter.min_gq}) & (AD>=${params.pipeline.variant_filter.min_ad}) & \
(VAF>=${params.pipeline.variant_filter.min_vaf}) & (VAF<=${params.pipeline.variant_filter.max_vaf}))\" -R lr.asm.hap.filt.cov.bed -Oz lr.asm.unfilt.vcf.gz | \
\${bcftools} sort -Oz -o lr.asm.filt.vcf.gz; tabix -p vcf lr.asm.filt.vcf.gz;
\${bcftools} query -i \"type=='snp'\" -f \"%CHROM\\t%POS0\\t%END\n\" lr.asm.filt.vcf.gz > lr.asm.filt.bed;
\${samtools} mpileup -B -q ${params.pipeline.min_mapq.strict} -Q 0 -l lr.asm.filt.bed lr.asm.hap.bam | cut -f1,2,6 | \
awk 'BEGIN {for(n=0;n<256;n++){ord[sprintf(\"%c\",n)]=n}} \
{l=split(\$3,a,\"\"); SUM=0; for(i=1;i<=l;i++){SUM+=ord[a[i]]-33}; MEAN=0; if (l!=0){MEAN=SUM/l}; if (MEAN>=20){print \$1 \"\\t\" (\$2-1) \"\\t\" \$2}}' > lr.asm.filt.hq.bed
\${bcftools} view -i \"(type=='snp') && (FILTER=='PASS') && ((GT=='AR') & (GQ>=${params.pipeline.variant_filter.min_gq}) & (AD>=${params.pipeline.variant_filter.min_ad}) & \
(VAF>=${params.pipeline.variant_filter.min_vaf}) & (VAF<=${params.pipeline.variant_filter.max_vaf}))\" -R lr.asm.filt.hq.bed -Oz lr.asm.filt.vcf.gz | \
\${bcftools} sort -Oz -o lr.asm.${task.process}.vcf.gz
tabix -p vcf lr.asm.${task.process}.vcf.gz
"""
}
/*
Haplotagging can haplotag primary and supplementary alignments (using provided phasing configuration files).
Sometimes, for a given read, only one or more supplementary will be haplotagged (phased) but not the
primary. When this is the case, this process changes the primary into a supplementary and changes one of
the supplementary into a primary.
*/
process hapResAsmPolish_phase_selectBestPhasedAlignRec {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path('lr.asm.hap.bam'), path('lr.asm.hap.bam.bai')
output:
tuple path("lr_asm_hap.${task.process}.bam"), path("lr_asm_hap.${task.process}.bam.bai")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
phased_supp2prim_py=\${PHASED_SUPP2PRIM_PY:-${params.python.script.phased_supp2prim}}
python \${phased_supp2prim_py} -t ${task.cpus} -b lr.asm.hap.bam -B lr_asm_hap.${task.process}.bam;
\${samtools} index -@ ${task.cpus} lr_asm_hap.${task.process}.bam;
"""
}
/*
Extract read alignments matching the alt allele of each phaseset.
Polish the assembly with those alignments, creating an alt haplotype-resolved assembly
Concat the alt assembly to the ref assembly
*/
process hapResAsmPolish_polishAlt {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path(asm_fa), path(asm_fai), path(asm_bed)
tuple path('hap_lr_polish.bam'), path('hap_lr_polish.bam.bai')
tuple path('hap_lr_polish.vcf.gz'), path('hap_lr_polish.vcf.gz.tbi')
output:
path "asm.${task.process}.fasta"
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
flye=\${FLYE:-${params.tools.flye.bin}}
seqkit=\${SEQKIT:-${params.tools.seqkit.bin}}
extract_ref_reads_py=\${EXTRACT_REF_READS_PY:-${params.python.script.extract_ref_reads}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1);
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}');
python \${extract_ref_reads_py} -t ${task.cpus} -b hap_lr_polish.bam -v hap_lr_polish.vcf.gz -B hap_lr_polish.alt.bam -p --alt
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M hap_lr_polish.alt.bam > hap_lr_polish.alt.sorted.bam
if [ ! -s hap_lr_polish.alt.sorted.bam ]; then echo \"File hap_lr_polish.alt.sorted.bam does not exist or is empty\" 1>&2; exit 1; fi
\${samtools} quickcheck hap_lr_polish.alt.sorted.bam
if [ ! \$? -eq 0 ]; then echo \"File hap_lr_polish.alt.sorted.bam is malformed\" 1>&2; exit 1; fi
mv -f hap_lr_polish.alt.sorted.bam hap_lr_polish.alt.bam
\${samtools} index -@ ${task.cpus} hap_lr_polish.alt.bam
\${flye} --polish-target ${asm_fa} --nano-hq hap_lr_polish.alt.bam -t ${task.cpus} -o flye.polish.alt
cat ${asm_fa} <(\${seqkit} seq -w0 -m${params.pipeline.min_len_collapsed_segment} flye.polish.alt/polished_1.fasta | \
awk '{if (substr(\$1,1,1)==\">\"){print \$1 \"_alt\"} else {print \$0}}') > asm.${task.process}.fasta
"""
}
process hapResAsmPolish_trioBin {
label 'medium_node'
shell '/bin/bash', '-euo', 'pipefail'
input:
tuple path('mixhap_asm.fasta'), path('mixhap_asm.fasta.fai'), path('mixhap_asm.bed')
tuple path('hapres_asm.fasta'), path('hapres_asm.fasta.fai'), path('hapres_asm.bed')
path hapres_asm_including_alt_fa
tuple path('hap_lr_polish.bam'), path('hap_lr_polish.bam.bai')
tuple path('hap_lr_polish.vcf.gz'), path('hap_lr_polish.vcf.gz.tbi')
tuple path('trio_binning_parentA.fq.gz'), path('trio_binning_parentA.gfa.gz'), path('trio_binning_parentA.bfi')
tuple path('trio_binning_parentB.fq.gz'), path('trio_binning_parentB.gfa.gz'), path('trio_binning_parentB.bfi')
output:
tuple path("trio_binning_h0.bed"), path("trio_binning_h1.bed"), path("trio_binning_h2.bed")
"""
samtools=\${SAMTOOLS:-${params.tools.samtools.bin}}
minimap2=\${MINIMAP2:-${params.tools.minimap2.bin}}
triobin=\${TRIO_BINNING:-${params.tools.trio_binning.bin}}
bin_ambiguous_py=\${BIN_AMBIGUOUS_HAP_PY:-${params.python.script.bin_ambiguous}}
detect_hap_collide_py=\${DETECT_HAP_COLLIDE_PY:-${params.python.script.detect_hap_collide}}
TASK_MEM=\$(echo -e \"${task.memory}\" | cut -d \" \" -f1);
MEM_PER_THREADS_SORT=\$(bc -l <<< \"(\${TASK_MEM} / ${task.cpus}) * ${params.tools.samtools.sort.mem_safety_ratio} * 1000\" | awk '{printf(\"%.0f\", \$0)}');
\${triobin} phase -t ${task.cpus} -v -f -S -U 1 -C ${hapres_asm_including_alt_fa} -r trio_binning -o trio_binning;
cat trio_binning.tsv | tr '_' '\\t' | sort -k1,1 -k2,2 -k3,3 | awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} \
{if (\$4==\"alt\"){print \$1 \"_\" \$2 \"_\" \$3 \"_\" \$4,\$5,\$6,\$7,\$8} else {print \$1 \"_\" \$2 \"_\" \$3,\$4,\$5,\$6,\$7}}' > trio_binning.sorted.tsv;
mv -f trio_binning.sorted.tsv trio_binning.tsv;
awk 'BEGIN {LN=1; PREV=\"\"} { if (LN==1){LN1=\$0; PREV=\$1; LN=0} else if (substr(\$1,1,length(\$1)-4)==PREV) {print LN1; print \$0; LN=1} else {LN1=\$0; PREV=\$1; LN=0}}' trio_binning.tsv > trio_binning.paired.tsv;
awk 'BEGIN {LN=1; PREV=\"\"} { if (LN==1){LN1=\$0; PREV=\$1; LN=0} else if (substr(\$1,1,length(\$1)-4)==PREV) {LN=1} else {print LN1; LN1=\$0; PREV=\$1; LN=0}} END {if (LN==0){print LN1}}' trio_binning.tsv > trio_binning.unpaired.tsv;
awk '{SUM=\$3+\$4; if (NR%2==1){CONTIGR=\$0; FORCED_PHASING_R=0; LOW_RATIO_UNIQ_R=0; if (SUM==0) {HR=0; COUNTR=0} else if (\$3>=\$4) {HR=1; COUNTR=\$3} else {HR=2; COUNTR=\$4}; \
if (SUM!=0){ if (COUNTR/SUM<0.8){FORCED_PHASING_R=1}; if (COUNTR/(\$2+SUM)<0.0005){LOW_RATIO_UNIQ_R=1}}} else {CONTIGA=\$0; FORCED_PHASING_A=0; LOW_RATIO_UNIQ_A=0; if (SUM==0) {HA=0; COUNTA=0} \
else if (\$3>=\$4) {HA=1; COUNTA=\$3} else {HA=2; COUNTA=\$4}; if (SUM!=0) {if (COUNTA/SUM<0.8){FORCED_PHASING_A=1}; if (COUNTA/(\$2+SUM)<0.0005){LOW_RATIO_UNIQ_A=1}}; \
if (HR!=HA){if ((HR==0) && (HA!=0)) {print CONTIGA} else {print CONTIGR}} else if (((FORCED_PHASING_R==1) && (FORCED_PHASING_A==0)) || ((LOW_RATIO_UNIQ_R==1) && (COUNTA>=1.2*COUNTR)) || \
((FORCED_PHASING_R==0) && (FORCED_PHASING_A==0) && (COUNTA>=2*COUNTR))) {print CONTIGA} else {print CONTIGR}}}' trio_binning.paired.tsv > trio_binning.best_binning.tsv;
cat <({ grep -v alt trio_binning.best_binning.tsv || true; }) <({ grep alt trio_binning.best_binning.tsv || true; } | awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} {print substr(\$1,1,length(\$1)-4),\$2,\$4,\$3,\$5}') trio_binning.unpaired.tsv | \
sort > trio_binning.best_binning.sorted.tsv;
mv -f trio_binning.best_binning.sorted.tsv trio_binning.best_binning.tsv;
mkdir -p binAmbiguous; cd binAmbiguous;
awk '{SUM=\$3+\$4; CONTIGR=\$1; FORCED_PHASING_R=0; LOW_RATIO_UNIQ_R=0; if (SUM==0) {HR=0; COUNTR=0} else if (\$3>=\$4) {HR=1; COUNTR=\$3} else {HR=2; COUNTR=\$4}; \
if (SUM!=0){ if (COUNTR/SUM<=0.6){FORCED_PHASING_R=1}; if (COUNTR/(\$2+SUM)<0.00025){LOW_RATIO_UNIQ_R=1}}; \
if ((HR!=0) && (FORCED_PHASING_R==0) && (LOW_RATIO_UNIQ_R==0)) {print CONTIGR \"\\t\" HR} else {print CONTIGR \"\\t0\"}}' ../trio_binning.best_binning.tsv > hap_bin.conf.list;
awk '{SUM=\$3+\$4; CONTIGR=\$1; FORCED_PHASING_R=0; LOW_RATIO_UNIQ_R=0; if (SUM==0) {HR=0; COUNTR=0} else if (\$3>=\$4) {HR=1; COUNTR=\$3} else \
{HR=2; COUNTR=\$4}; print CONTIGR \"\t\" HR}' ../trio_binning.best_binning.tsv > hap_bin.unconf.list;
python \${bin_ambiguous_py} -t ${task.cpus} -b ../hap_lr_polish.bam -v ../hap_lr_polish.vcf.gz -B hap_bin.conf.list > hap_bin.conf.1.list;
awk '{if (\$2!=0){print \$0}}' hap_bin.conf.1.list | sort -k1,1 > hap_bin.conf.1.binned.list;
awk '{if (\$2==0){print \$0}}' hap_bin.conf.1.list | sort -k1,1 > hap_bin.conf.1.unbinned.list;
cat hap_bin.conf.1.binned.list <(join hap_bin.conf.1.unbinned.list hap_bin.unconf.list | awk '{print \$1 \"\\t\" \$4 \"\\t\" \$3}') | sort -k1,1 > hap_bin.conf.2.all.list;
mkdir -p hap2mixed; cd hap2mixed;
\${minimap2} -t ${task.cpus} ${params.tools.minimap2.param.asm} -I \$(du -L -BG ../../mixhap_asm.fasta | cut -f1) ../../mixhap_asm.fasta ../../hapres_asm.fasta > asm_hap2mixed.sam;
\${samtools} sort -@ ${task.cpus} -m \${MEM_PER_THREADS_SORT}M asm_hap2mixed.sam > asm_hap2mixed.bam;
if [ ! -s asm_hap2mixed.bam ]; then echo \"File asm_hap2mixed.bam does not exist or is empty\" 1>&2; exit 1; fi;
\${samtools} quickcheck asm_hap2mixed.bam;