Skip to content

Commit 9660aff

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc.

Signed-off-by: Steven Noonan <[email protected]>
1 parent 6f5657f commit 9660aff

File tree

1 file changed

+54
-30
lines changed

1 file changed

+54
-30
lines changed

miniaudio.h

+54-30
Original file line numberDiff line numberDiff line change
@@ -42808,7 +42808,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4280842808
}
4280942809
}
4281042810

42811-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42811+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4281242812
{
4281342813
ma_uint64 iSample;
4281442814

@@ -43103,10 +43103,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4310343103
sampleCount = frameCount * channels;
4310443104

4310543105
if (volume == 1) {
43106+
#pragma clang loop vectorize(enable)
4310643107
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4310743108
pDst[iSample] += pSrc[iSample];
4310843109
}
4310943110
} else {
43111+
#pragma clang loop vectorize(enable)
4311043112
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4311143113
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4311243114
}
@@ -45407,7 +45409,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4540745409
const float a1 = pBQ->a1.f32;
4540845410
const float a2 = pBQ->a2.f32;
4540945411

45410-
MA_ASSUME(channels > 0);
45412+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45413+
#pragma clang loop unroll(disable)
4541145414
for (c = 0; c < channels; c += 1) {
4541245415
float r1 = pBQ->pR1[c].f32;
4541345416
float r2 = pBQ->pR2[c].f32;
@@ -45439,7 +45442,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4543945442
const ma_int32 a1 = pBQ->a1.s32;
4544045443
const ma_int32 a2 = pBQ->a2.s32;
4544145444

45442-
MA_ASSUME(channels > 0);
45445+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45446+
#pragma clang loop unroll(disable)
4544345447
for (c = 0; c < channels; c += 1) {
4544445448
ma_int32 r1 = pBQ->pR1[c].s32;
4544545449
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45713,22 +45717,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4571345717
return MA_SUCCESS;
4571445718
}
4571545719

45716-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45720+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4571745721
{
4571845722
ma_uint32 c;
4571945723
const ma_uint32 channels = pLPF->channels;
4572045724
const float a = pLPF->a.f32;
4572145725
const float b = 1 - a;
4572245726

45723-
MA_ASSUME(channels > 0);
45727+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45728+
#pragma clang loop unroll(disable)
4572445729
for (c = 0; c < channels; c += 1) {
4572545730
float r1 = pLPF->pR1[c].f32;
45726-
float x = pX[c];
45731+
float x = pX[c];
4572745732
float y;
4572845733

45729-
y = b*x + a*r1;
45734+
y = b * x + a * r1;
4573045735

45731-
pY[c] = y;
45736+
pY[c] = y;
4573245737
pLPF->pR1[c].f32 = y;
4573345738
}
4573445739
}
@@ -45740,7 +45745,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4574045745
const ma_int32 a = pLPF->a.s32;
4574145746
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4574245747

45743-
MA_ASSUME(channels > 0);
45748+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45749+
#pragma clang loop unroll(disable)
4574445750
for (c = 0; c < channels; c += 1) {
4574545751
ma_int32 r1 = pLPF->pR1[c].s32;
4574645752
ma_int32 x = pX[c];
@@ -46593,7 +46599,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4659346599
const float a = 1 - pHPF->a.f32;
4659446600
const float b = 1 - a;
4659546601

46596-
MA_ASSUME(channels > 0);
46602+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4659746603
for (c = 0; c < channels; c += 1) {
4659846604
float r1 = pHPF->pR1[c].f32;
4659946605
float x = pX[c];
@@ -46613,7 +46619,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4661346619
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4661446620
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4661546621

46616-
MA_ASSUME(channels > 0);
46622+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4661746623
for (c = 0; c < channels; c += 1) {
4661846624
ma_int32 r1 = pHPF->pR1[c].s32;
4661946625
ma_int32 x = pX[c];
@@ -48721,6 +48727,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4872148727
ma_uint64 iFrame;
4872248728
ma_uint32 iChannel;
4872348729
ma_uint64 interpolatedFrameCount;
48730+
const ma_uint32 channels = pGainer->config.channels;
4872448731

4872548732
MA_ASSERT(pGainer != NULL);
4872648733

@@ -48760,12 +48767,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4876048767
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4876148768
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4876248769

48763-
if (pGainer->config.channels <= 32) {
48770+
if (channels <= 32) {
4876448771
float pRunningGain[32];
4876548772
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4876648773

4876748774
/* Initialize the running gain. */
48768-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48775+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4876948776
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4877048777
pRunningGainDelta[iChannel] = t * d;
4877148778
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48774,7 +48781,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4877448781
iFrame = 0;
4877548782

4877648783
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48777-
if (pGainer->config.channels == 2) {
48784+
if (channels == 2) {
4877848785
#if defined(MA_SUPPORT_SSE2)
4877948786
if (ma_has_sse2()) {
4878048787
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48822,6 +48829,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4882248829

4882348830
iFrame = unrolledLoopCount << 1;
4882448831
#else
48832+
#pragma clang loop vectorize(enable)
4882548833
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4882648834
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4882748835
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48833,7 +48841,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4883348841
}
4883448842
#endif
4883548843
}
48836-
} else if (pGainer->config.channels == 6) {
48844+
} else if (channels == 6) {
4883748845
#if defined(MA_SUPPORT_SSE2)
4883848846
if (ma_has_sse2()) {
4883948847
/*
@@ -48866,6 +48874,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4886648874
} else
4886748875
#endif
4886848876
{
48877+
#pragma clang loop vectorize(enable)
4886948878
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4887048879
for (iChannel = 0; iChannel < 6; iChannel += 1) {
4887148880
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48877,7 +48886,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4887748886
}
4887848887
}
4887948888
}
48880-
} else if (pGainer->config.channels == 8) {
48889+
} else if (channels == 8) {
4888148890
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4888248891
#if defined(MA_SUPPORT_SSE2)
4888348892
if (ma_has_sse2()) {
@@ -48897,6 +48906,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4889748906
#endif
4889848907
{
4889948908
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48909+
#pragma clang loop vectorize(enable)
4890048910
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4890148911
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4890248912
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48910,17 +48920,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4891048920
}
4891148921
}
4891248922

48923+
#pragma clang loop unroll(disable)
4891348924
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48914-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48915-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48925+
#pragma clang loop vectorize(enable)
48926+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48927+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4891648928
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4891748929
}
4891848930
}
4891948931
} else {
4892048932
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48933+
#pragma clang loop unroll(disable)
4892148934
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48922-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48923-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48935+
#pragma clang loop vectorize(enable)
48936+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48937+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4892448938
}
4892548939

4892648940
a += d;
@@ -48939,18 +48953,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4893948953

4894048954
/* All we need to do here is apply the new gains using an optimized path. */
4894148955
if (pFramesOut != NULL && pFramesIn != NULL) {
48942-
if (pGainer->config.channels <= 32) {
48956+
if (channels <= 32) {
4894348957
float gains[32];
48944-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48958+
#pragma clang loop unroll(disable)
48959+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4894548960
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4894648961
}
4894748962

48948-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48963+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4894948964
} else {
4895048965
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48966+
#pragma clang loop unroll(disable)
4895148967
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48952-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48953-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48968+
#pragma clang loop vectorize(enable)
48969+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48970+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4895448971
}
4895548972
}
4895648973
}
@@ -51320,7 +51337,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5132051337

5132151338
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5132251339

51323-
MA_ASSUME(channels > 0);
51340+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5132451341
for (c = 0; c < channels; c += 1) {
5132551342
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5132651343
pFrameOut[c] = s;
@@ -51339,7 +51356,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5133951356

5134051357
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5134151358

51342-
MA_ASSUME(channels > 0);
51359+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5134351360
for (c = 0; c < channels; c += 1) {
5134451361
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5134551362
pFrameOut[c] = s;
@@ -52574,6 +52591,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
5257452591
ma_uint64 iFrame;
5257552592
ma_uint32 iChannelOut;
5257652593

52594+
#pragma clang loop unroll(disable)
5257752595
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5257852596
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5257952597
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52594,6 +52612,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
5259452612
ma_uint64 iFrame;
5259552613
ma_uint32 iChannelOut;
5259652614

52615+
#pragma clang loop unroll(disable)
5259752616
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5259852617
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5259952618
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52636,6 +52655,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
5263652655
ma_uint64 iFrame;
5263752656
ma_uint32 iChannelOut;
5263852657

52658+
#pragma clang loop unroll(disable)
5263952659
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5264052660
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5264152661
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52656,6 +52676,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
5265652676
ma_uint64 iFrame;
5265752677
ma_uint32 iChannelOut;
5265852678

52679+
#pragma clang loop unroll(disable)
5265952680
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5266052681
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5266152682
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52890,6 +52911,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5289052911
} else
5289152912
#endif
5289252913
{
52914+
#pragma clang loop vectorize(enable)
5289352915
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5289452916
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5289552917
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52917,6 +52939,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5291752939
} else
5291852940
#endif
5291952941
{
52942+
#pragma clang loop vectorize(enable)
5292052943
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5292152944
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5292252945
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52934,6 +52957,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5293452957
} else
5293552958
#endif
5293652959
{
52960+
#pragma clang loop vectorize(enable)
5293752961
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5293852962
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5293952963
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66257,7 +66281,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6625766281
ma_uint64 iFrame;
6625866282
ma_uint32 iChannel;
6625966283
const ma_uint32 channels = pNoise->config.channels;
66260-
MA_ASSUME(channels > 0);
66284+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6626166285

6626266286
if (pNoise->config.format == ma_format_f32) {
6626366287
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66376,7 +66400,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6637666400
ma_uint64 iFrame;
6637766401
ma_uint32 iChannel;
6637866402
const ma_uint32 channels = pNoise->config.channels;
66379-
MA_ASSUME(channels > 0);
66403+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6638066404

6638166405
if (pNoise->config.format == ma_format_f32) {
6638266406
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66458,7 +66482,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6645866482
ma_uint64 iFrame;
6645966483
ma_uint32 iChannel;
6646066484
const ma_uint32 channels = pNoise->config.channels;
66461-
MA_ASSUME(channels > 0);
66485+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6646266486

6646366487
if (pNoise->config.format == ma_format_f32) {
6646466488
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)