@@ -42808,7 +42808,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42808
42808
}
42809
42809
}
42810
42810
42811
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42811
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42812
42812
{
42813
42813
ma_uint64 iSample;
42814
42814
@@ -43103,10 +43103,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43103
43103
sampleCount = frameCount * channels;
43104
43104
43105
43105
if (volume == 1) {
43106
+ #pragma clang loop vectorize(enable)
43106
43107
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43107
43108
pDst[iSample] += pSrc[iSample];
43108
43109
}
43109
43110
} else {
43111
+ #pragma clang loop vectorize(enable)
43110
43112
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43111
43113
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43112
43114
}
@@ -45407,7 +45409,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45407
45409
const float a1 = pBQ->a1.f32;
45408
45410
const float a2 = pBQ->a2.f32;
45409
45411
45410
- MA_ASSUME(channels > 0);
45412
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45413
+ #pragma clang loop unroll(disable)
45411
45414
for (c = 0; c < channels; c += 1) {
45412
45415
float r1 = pBQ->pR1[c].f32;
45413
45416
float r2 = pBQ->pR2[c].f32;
@@ -45439,7 +45442,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45439
45442
const ma_int32 a1 = pBQ->a1.s32;
45440
45443
const ma_int32 a2 = pBQ->a2.s32;
45441
45444
45442
- MA_ASSUME(channels > 0);
45445
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45446
+ #pragma clang loop unroll(disable)
45443
45447
for (c = 0; c < channels; c += 1) {
45444
45448
ma_int32 r1 = pBQ->pR1[c].s32;
45445
45449
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45713,22 +45717,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45713
45717
return MA_SUCCESS;
45714
45718
}
45715
45719
45716
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45720
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45717
45721
{
45718
45722
ma_uint32 c;
45719
45723
const ma_uint32 channels = pLPF->channels;
45720
45724
const float a = pLPF->a.f32;
45721
45725
const float b = 1 - a;
45722
45726
45723
- MA_ASSUME(channels > 0);
45727
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45728
+ #pragma clang loop unroll(disable)
45724
45729
for (c = 0; c < channels; c += 1) {
45725
45730
float r1 = pLPF->pR1[c].f32;
45726
- float x = pX[c];
45731
+ float x = pX[c];
45727
45732
float y;
45728
45733
45729
- y = b* x + a* r1;
45734
+ y = b * x + a * r1;
45730
45735
45731
- pY[c] = y;
45736
+ pY[c] = y;
45732
45737
pLPF->pR1[c].f32 = y;
45733
45738
}
45734
45739
}
@@ -45740,7 +45745,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45740
45745
const ma_int32 a = pLPF->a.s32;
45741
45746
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45742
45747
45743
- MA_ASSUME(channels > 0);
45748
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45749
+ #pragma clang loop unroll(disable)
45744
45750
for (c = 0; c < channels; c += 1) {
45745
45751
ma_int32 r1 = pLPF->pR1[c].s32;
45746
45752
ma_int32 x = pX[c];
@@ -46593,7 +46599,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46593
46599
const float a = 1 - pHPF->a.f32;
46594
46600
const float b = 1 - a;
46595
46601
46596
- MA_ASSUME(channels > 0 );
46602
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46597
46603
for (c = 0; c < channels; c += 1) {
46598
46604
float r1 = pHPF->pR1[c].f32;
46599
46605
float x = pX[c];
@@ -46613,7 +46619,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46613
46619
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46614
46620
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46615
46621
46616
- MA_ASSUME(channels > 0 );
46622
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46617
46623
for (c = 0; c < channels; c += 1) {
46618
46624
ma_int32 r1 = pHPF->pR1[c].s32;
46619
46625
ma_int32 x = pX[c];
@@ -48721,6 +48727,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48721
48727
ma_uint64 iFrame;
48722
48728
ma_uint32 iChannel;
48723
48729
ma_uint64 interpolatedFrameCount;
48730
+ const ma_uint32 channels = pGainer->config.channels;
48724
48731
48725
48732
MA_ASSERT(pGainer != NULL);
48726
48733
@@ -48760,12 +48767,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48760
48767
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48761
48768
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48762
48769
48763
- if (pGainer->config. channels <= 32) {
48770
+ if (channels <= 32) {
48764
48771
float pRunningGain[32];
48765
48772
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48766
48773
48767
48774
/* Initialize the running gain. */
48768
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48775
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48769
48776
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48770
48777
pRunningGainDelta[iChannel] = t * d;
48771
48778
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48774,7 +48781,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48774
48781
iFrame = 0;
48775
48782
48776
48783
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48777
- if (pGainer->config. channels == 2) {
48784
+ if (channels == 2) {
48778
48785
#if defined(MA_SUPPORT_SSE2)
48779
48786
if (ma_has_sse2()) {
48780
48787
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48822,6 +48829,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48822
48829
48823
48830
iFrame = unrolledLoopCount << 1;
48824
48831
#else
48832
+ #pragma clang loop vectorize(enable)
48825
48833
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48826
48834
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48827
48835
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48833,7 +48841,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48833
48841
}
48834
48842
#endif
48835
48843
}
48836
- } else if (pGainer->config. channels == 6) {
48844
+ } else if (channels == 6) {
48837
48845
#if defined(MA_SUPPORT_SSE2)
48838
48846
if (ma_has_sse2()) {
48839
48847
/*
@@ -48866,6 +48874,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48866
48874
} else
48867
48875
#endif
48868
48876
{
48877
+ #pragma clang loop vectorize(enable)
48869
48878
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48870
48879
for (iChannel = 0; iChannel < 6; iChannel += 1) {
48871
48880
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48877,7 +48886,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48877
48886
}
48878
48887
}
48879
48888
}
48880
- } else if (pGainer->config. channels == 8) {
48889
+ } else if (channels == 8) {
48881
48890
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48882
48891
#if defined(MA_SUPPORT_SSE2)
48883
48892
if (ma_has_sse2()) {
@@ -48897,6 +48906,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48897
48906
#endif
48898
48907
{
48899
48908
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48909
+ #pragma clang loop vectorize(enable)
48900
48910
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48901
48911
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48902
48912
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48910,17 +48920,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48910
48920
}
48911
48921
}
48912
48922
48923
+ #pragma clang loop unroll(disable)
48913
48924
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48914
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48915
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48925
+ #pragma clang loop vectorize(enable)
48926
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48927
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48916
48928
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48917
48929
}
48918
48930
}
48919
48931
} else {
48920
48932
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48933
+ #pragma clang loop unroll(disable)
48921
48934
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48922
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48923
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48935
+ #pragma clang loop vectorize(enable)
48936
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48937
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48924
48938
}
48925
48939
48926
48940
a += d;
@@ -48939,18 +48953,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48939
48953
48940
48954
/* All we need to do here is apply the new gains using an optimized path. */
48941
48955
if (pFramesOut != NULL && pFramesIn != NULL) {
48942
- if (pGainer->config. channels <= 32) {
48956
+ if (channels <= 32) {
48943
48957
float gains[32];
48944
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48958
+ #pragma clang loop unroll(disable)
48959
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48945
48960
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48946
48961
}
48947
48962
48948
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48963
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48949
48964
} else {
48950
48965
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48966
+ #pragma clang loop unroll(disable)
48951
48967
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48952
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48953
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48968
+ #pragma clang loop vectorize(enable)
48969
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48970
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48954
48971
}
48955
48972
}
48956
48973
}
@@ -51320,7 +51337,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51320
51337
51321
51338
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51322
51339
51323
- MA_ASSUME(channels > 0 );
51340
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51324
51341
for (c = 0; c < channels; c += 1) {
51325
51342
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51326
51343
pFrameOut[c] = s;
@@ -51339,7 +51356,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51339
51356
51340
51357
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51341
51358
51342
- MA_ASSUME(channels > 0 );
51359
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51343
51360
for (c = 0; c < channels; c += 1) {
51344
51361
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51345
51362
pFrameOut[c] = s;
@@ -52574,6 +52591,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
52574
52591
ma_uint64 iFrame;
52575
52592
ma_uint32 iChannelOut;
52576
52593
52594
+ #pragma clang loop unroll(disable)
52577
52595
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52578
52596
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52579
52597
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52594,6 +52612,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
52594
52612
ma_uint64 iFrame;
52595
52613
ma_uint32 iChannelOut;
52596
52614
52615
+ #pragma clang loop unroll(disable)
52597
52616
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52598
52617
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52599
52618
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52636,6 +52655,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
52636
52655
ma_uint64 iFrame;
52637
52656
ma_uint32 iChannelOut;
52638
52657
52658
+ #pragma clang loop unroll(disable)
52639
52659
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52640
52660
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52641
52661
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52656,6 +52676,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
52656
52676
ma_uint64 iFrame;
52657
52677
ma_uint32 iChannelOut;
52658
52678
52679
+ #pragma clang loop unroll(disable)
52659
52680
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52660
52681
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52661
52682
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52890,6 +52911,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52890
52911
} else
52891
52912
#endif
52892
52913
{
52914
+ #pragma clang loop vectorize(enable)
52893
52915
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52894
52916
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52895
52917
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52917,6 +52939,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52917
52939
} else
52918
52940
#endif
52919
52941
{
52942
+ #pragma clang loop vectorize(enable)
52920
52943
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52921
52944
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52922
52945
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52934,6 +52957,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52934
52957
} else
52935
52958
#endif
52936
52959
{
52960
+ #pragma clang loop vectorize(enable)
52937
52961
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52938
52962
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52939
52963
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66257,7 +66281,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66257
66281
ma_uint64 iFrame;
66258
66282
ma_uint32 iChannel;
66259
66283
const ma_uint32 channels = pNoise->config.channels;
66260
- MA_ASSUME(channels > 0 );
66284
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66261
66285
66262
66286
if (pNoise->config.format == ma_format_f32) {
66263
66287
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66376,7 +66400,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66376
66400
ma_uint64 iFrame;
66377
66401
ma_uint32 iChannel;
66378
66402
const ma_uint32 channels = pNoise->config.channels;
66379
- MA_ASSUME(channels > 0 );
66403
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66380
66404
66381
66405
if (pNoise->config.format == ma_format_f32) {
66382
66406
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66458,7 +66482,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66458
66482
ma_uint64 iFrame;
66459
66483
ma_uint32 iChannel;
66460
66484
const ma_uint32 channels = pNoise->config.channels;
66461
- MA_ASSUME(channels > 0 );
66485
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66462
66486
66463
66487
if (pNoise->config.format == ma_format_f32) {
66464
66488
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments