This is a modernization of my classic 1993 paper "Guidelines for signal processing applications in C", incorporating C11/C17/C23 features, SIMD intrinsics, OpenMP, and contemporary optimization techniques.
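The C programming language remains essential for high-performance signal processing, but the landscape has changed dramatically since 1993. Modern processors feature wide SIMD vector units, multiple cores, and deep cache hierarchies, and modern compilers can auto-vectorize simple loops.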
This guide presents recommendations for efficiently implementing signal processing algorithms using modern C (C11/C17/C23) without sacrificing clarity or portability.
Old (1993): Relied on char, short, int, long with implementation-defined sizes.
Modern (2026): Use <stdint.h> for guaranteed sizes:
#include <stdint.h>
// Exact width integers
int8_t, uint8_t // 8-bit
int16_t, uint16_t // 16-bit (CD-quality audio samples)
int32_t, uint32_t // 32-bit (sample counters, phase accumulators)
int64_t, uint64_t // 64-bit (high-precision timing)
// Fast types (at least N bits, optimized for speed)
int_fast32_t // Fastest type with at least 32 bits
uint_fast32_t // Useful for loop counters
// Size types
size_t // Array indices and sizes
ptrdiff_t // Pointer differences
Recommendation: use the fixed-width types from <stdint.h> for signal data; use size_t for array indices.
#include <math.h>
// Single precision (32-bit IEEE 754)
float sample; // Audio samples, filter coefficients
// Suffix: 3.14159f
// Double precision (64-bit IEEE 754)
double timestamp; // High-precision timing, accumulations
// No suffix: 3.14159
// Check for specific features
#ifdef __STDC_IEC_559__
// IEEE 754 floating-point is guaranteed
#endif
Recommendation: use float for audio samples and real-time calculations. Use double for time measurements and accumulations where precision matters.
#include <stdint.h>
// Modern audio sample types
typedef float sample_t; // Normalized audio sample [-1.0, 1.0]
typedef uint32_t phase_t; // Phase accumulator (wraps naturally)
typedef float frequency_t; // Frequency in Hz
typedef double timestamp_t; // Time in seconds
typedef int32_t framecount_t; // Sample frame counter
// Complex numbers (C99)
#include <complex.h>
#include <math.h>
// M_PI is a POSIX extension, not ISO C: strict modes such as -std=c17 hide it,
// so supply a fallback definition.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// z = 1 + 2i in single precision
float complex z = 1.0f + 2.0f * I;
// w = e^(i*2*pi/8), a primitive 8th root of unity (FFT twiddle factor)
double complex w = cexp(I * 2.0 * M_PI / 8.0);
Modern SIMD instructions require properly aligned memory (16-byte for SSE, 32-byte for AVX, 64-byte for AVX-512).
#include <stdlib.h>
#include <stdalign.h>
// Allocate 32-byte aligned memory for AVX
float *samples = aligned_alloc(32, n_samples * sizeof(float));
if (!samples) {
// Handle allocation failure
}
// Always free aligned memory
free(samples);
#include <stdalign.h>
#include <stddef.h> // max_align_t
// Align stack arrays to a 32-byte boundary (AVX vector width)
alignas(32) float buffer[1024];
// alignas also accepts a type name (already in C11); C23 turns alignas into a
// keyword, so <stdalign.h> is only needed before C23
alignas(max_align_t) char storage[4096];
Recommendation: use aligned_alloc() for heap allocations and alignas() for stack arrays.
The restrict qualifier tells the compiler that pointer arguments don't alias, enabling aggressive optimizations.
// Baseline element-wise addition: result[i] = a[i] + b[i] for i in [0, n).
// The pointers carry no restrict qualifier, so the arrays are allowed to
// overlap and elements are processed strictly in index order.
void vector_add(float *a,
                float *b,
                float *result,
                size_t n) {
    size_t idx = 0;
    while (idx < n) {
        result[idx] = a[idx] + b[idx];
        ++idx;
    }
}
// Element-wise addition with restrict-qualified pointers:
// result[i] = a[i] + b[i] for every i in [0, n).
// The caller promises that the three arrays do not overlap, which lets the
// compiler vectorize and reorder the loads and stores.
void vector_add(
    const float * restrict a,
    const float * restrict b,
    float * restrict result,
    size_t n) {
    for (size_t k = 0; k != n; ++k) {
        const float sum = a[k] + b[k];
        result[k] = sum;
    }
}
Recommendation: use restrict on all non-overlapping pointer parameters in performance-critical functions.
Modern processors provide SIMD instructions that process multiple data elements in parallel.
#include <stddef.h>
// Apply a constant gain: output[i] = input[i] * scale for i in [0, n).
// Deliberately a plain counted loop with non-overlapping (restrict) buffers,
// the shape compilers auto-vectorize at -O3 -march=native.
void vector_scale(const float * restrict input,
                  float * restrict output,
                  float scale,
                  size_t n) {
    for (size_t k = 0; k != n; ++k) {
        const float x = input[k];
        output[k] = x * scale;
    }
}
# GCC/Clang
gcc -O3 -march=native -ffast-math signal_processing.c
# Intel ICC
icc -O3 -xHost -fp-model fast=2 signal_processing.c
#include <immintrin.h> // AVX/AVX2/AVX-512
#include <stddef.h>
// Element-wise addition using AVX: result[i] = a[i] + b[i] for i in [0, n).
//
// The arrays must not overlap (restrict) but need NOT be 32-byte aligned:
// the unaligned load/store intrinsics are used because the aligned forms
// (_mm256_load_ps / _mm256_store_ps) fault on any address that is not a
// multiple of 32, and this interface accepts arbitrary float pointers.
// On current x86 cores the unaligned forms generally cost the same as the
// aligned ones when the data happens to be aligned.
void vector_add_avx(const float * restrict a,
                    const float * restrict b,
                    float * restrict result,
                    size_t n) {
    size_t i = 0;
    // Process 8 floats (one 256-bit register) per iteration
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);   // Load 8 floats
        __m256 vb = _mm256_loadu_ps(&b[i]);   // Load 8 floats
        __m256 vr = _mm256_add_ps(va, vb);    // Add 8 floats in parallel
        _mm256_storeu_ps(&result[i], vr);     // Store 8 floats
    }
    // Scalar tail: the remaining n % 8 elements
    for (; i < n; i++) {
        result[i] = a[i] + b[i];
    }
}
#include <arm_neon.h>
// Element-wise product using ARM NEON: result[i] = a[i] * b[i] for i in [0, n).
// The bulk of the array is handled four lanes at a time; the final n % 4
// elements fall through to a scalar loop. The arrays must not overlap.
void vector_multiply_neon(const float * restrict a,
                          const float * restrict b,
                          float * restrict result,
                          size_t n) {
    // Largest multiple of 4 that does not exceed n
    const size_t vec_end = n - (n % 4);
    size_t pos;
    for (pos = 0; pos < vec_end; pos += 4) {
        const float32x4_t lhs = vld1q_f32(a + pos);
        const float32x4_t rhs = vld1q_f32(b + pos);
        vst1q_f32(result + pos, vmulq_f32(lhs, rhs));
    }
    // Scalar tail
    while (pos < n) {
        result[pos] = a[pos] * b[pos];
        ++pos;
    }
}
#include <stddef.h>
// Affine transform of every sample: output[i] = input[i] * 2 + 1.
// The OpenMP simd directive asks for vectorization and asserts that both
// buffers start on a 32-byte boundary; callers must honour that when the
// code is built with OpenMP enabled.
void vector_process(const float * restrict input,
                    float * restrict output,
                    size_t n) {
    const float gain = 2.0f;
    const float offset = 1.0f;
    #pragma omp simd aligned(input, output: 32)
    for (size_t k = 0; k < n; k++) {
        output[k] = input[k] * gain + offset;
    }
}
#include <omp.h>
// Causal convolution: output[n] = sum over k of signal[n - k] * kernel[k],
// using only taps for which n - k stays inside the signal (k <= n).
// Output samples are distributed across threads; each inner product is a
// simd reduction. The three arrays must not overlap.
void convolution_parallel(const float * restrict signal,
                          const float * restrict kernel,
                          float * restrict output,
                          size_t signal_len,
                          size_t kernel_len) {
    #pragma omp parallel for schedule(static)
    for (size_t n = 0; n < signal_len; n++) {
        // Taps beyond index n would read before the first sample, so the
        // tap count is min(kernel_len, n + 1)
        const size_t taps = (n < kernel_len) ? n + 1 : kernel_len;
        float acc = 0.0f;
        #pragma omp simd reduction(+:acc)
        for (size_t k = 0; k < taps; k++) {
            acc += signal[n - k] * kernel[k];
        }
        output[n] = acc;
    }
}
gcc -O3 -march=native -fopenmp signal.c
Recommendation: use #pragma omp simd for vectorization hints and #pragma omp parallel for to spread work across cores. Combine them for maximum performance.
// Type-checked replacement for a CLAMP() macro: limits x to [min, max].
// Returns min when x < min, max when x > max, and x itself otherwise
// (so a NaN input, which fails both comparisons, is returned unchanged).
static inline float clamp(float x, float min, float max) {
    if (x < min) {
        return min;
    }
    if (x > max) {
        return max;
    }
    return x;
}
// Linear interpolation between a (t = 0) and b (t = 1), computed as
// a + t * (b - a). Small enough that the compiler will inline it.
static inline float lerp(float a, float b, float t) {
    const float span = b - a;
    return a + t * span;
}
// hot: the function is called frequently, so the compiler optimizes it
// more aggressively
__attribute__((hot))
void audio_callback(float *buffer, size_t n);
// cold: the function is rarely executed, so the compiler optimizes it for
// size and treats paths leading to it as unlikely
__attribute__((cold))
void error_handler(const char *msg);
// const: the result depends only on the argument values; the function has
// no side effects and reads no memory other than its arguments
__attribute__((const))
float compute_frequency(int midi_note);
// pure: the function has no side effects but may read memory (here through
// the buffer pointer), so repeated calls can be merged only while that
// memory is unchanged
__attribute__((pure))
float calculate_rms(const float *buffer, size_t n);
// nonnull(1): argument 1 (buffer) is never NULL. The function-level form with
// an argument index is accepted by both GCC and Clang; writing the attribute
// on the parameter itself is a Clang-only spelling that GCC ignores.
__attribute__((nonnull(1)))
void process_audio(float *buffer, size_t n);
Recommendation: use static inline for small utility functions. Enable LTO (-flto) for maximum cross-module optimization.
Here's a complete modern example comparing old and new approaches:
// 1993-style strided kernel: for each of vect_length elements computes
//   out = (in1 + in2) / (in1 - in2)
// reading and writing with independent element strides. A dedicated loop
// handles the common case where all three strides are 1.
void sun_vaddosb(float *vect_in1, unsigned long incr_in1,
                 float *vect_in2, unsigned long incr_in2,
                 float *vect_out, unsigned long incr_out,
                 unsigned long vect_length) {
    register float x, y, diff;
    const int unit_stride =
        (incr_in1 == 1) && (incr_in2 == 1) && (incr_out == 1);

    if (unit_stride) {
        // Contiguous fast path: plain post-increment pointer walk
        for (; vect_length != 0; vect_length--) {
            x = *vect_in1++;
            y = *vect_in2++;
            diff = x - y;
            *vect_out++ = (x + y) / diff;
        }
        return;
    }

    // General path: advance each pointer by its own stride
    for (; vect_length != 0; vect_length--) {
        x = *vect_in1;
        y = *vect_in2;
        diff = x - y;
        *vect_out = (x + y) / diff;
        vect_in1 += incr_in1;
        vect_in2 += incr_in2;
        vect_out += incr_out;
    }
}
#include <stddef.h>
#include <math.h>
// Modern counterpart of the strided 1993 routine:
//   result[i] = (a[i] + b[i]) / (a[i] - b[i])   for i in [0, n)
// Contiguous, non-overlapping arrays only. The simd directive asserts that
// all three pointers are 32-byte aligned when OpenMP is enabled; no manual
// unrolling or pointer tricks - vectorization is left to the compiler.
static inline void vector_addoversub(
    const float * restrict a,
    const float * restrict b,
    float * restrict result,
    size_t n) {
    #pragma omp simd aligned(a, b, result: 32)
    for (size_t k = 0; k < n; k++) {
        const float x = a[k];
        const float y = b[k];
        result[k] = (x + y) / (x - y);
    }
}
# GCC/Clang - Maximum optimization
# (A comment cannot follow a trailing backslash - it breaks the line
#  continuation - so the flags are explained here instead.)
#   -march=native          Use all CPU features available
#   -mtune=native          Optimize for this specific CPU
#   -ffast-math            Aggressive float optimizations
#   -flto                  Link-time optimization
#   -fomit-frame-pointer   Extra register
#   -fopenmp               OpenMP support
gcc -std=c17 \
    -O3 \
    -march=native \
    -mtune=native \
    -ffast-math \
    -flto \
    -fomit-frame-pointer \
    -fopenmp \
    signal_processing.c -o signal_opt
# Intel ICC (now Intel oneAPI DPC++/C++)
# (A comment cannot follow a trailing backslash - it breaks the line
#  continuation - so the flags are explained here instead.)
#   -xHost               Optimize for current CPU
#   -ipo                 Interprocedural optimization
#   -qopenmp             OpenMP support
#   -fp-model fast=2     Aggressive FP optimization
icx -std=c17 \
    -O3 \
    -xHost \
    -ipo \
    -qopenmp \
    -fp-model fast=2 \
    signal_processing.c -o signal_opt
# Step 1: Compile with instrumentation
gcc -O3 -march=native -fprofile-generate signal.c -o signal_prof
# Step 2: Run with representative data
./signal_prof < typical_input.wav
# Step 3: Rebuild with profile data
gcc -O3 -march=native -fprofile-use signal.c -o signal_pgo
#include <math.h>
#include <tgmath.h> // C99 type-generic math macros
// M_PI is a POSIX extension, not ISO C: strict modes such as -std=c17 hide it,
// so supply a fallback definition.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// Old: sin() always returns double
double phase = sin(2.0 * M_PI * freq * time);
// Modern: the type-generic sin() selects sinf() only when its argument has
// type float. M_PI is a double constant, so it must be narrowed explicitly;
// otherwise the whole expression is promoted to double and sin() is called.
float phase_f = sin(2.0f * (float)M_PI * freq * time); // sinf() when freq and time are float
// Fused multiply-add: computes a*b + c with a single rounding. It is a single
// instruction on hardware with FMA support, but can fall back to a slow
// library routine where that support is missing.
float result = fmaf(a, b, c);
// Intel MKL (Math Kernel Library)
#include <mkl.h>
vsAdd(n, a, b, result); // Vectorized addition
vsSin(n, input, output); // Vectorized sine
// Apple Accelerate Framework (macOS/iOS)
#include <Accelerate/Accelerate.h>
vDSP_vadd(a, 1, b, 1, result, 1, n); // Vector add
vDSP_vsq(input, 1, output, 1, n); // Vector square
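Hardware prefetchers already handle simple sequential streams well, so explicit prefetch hints mainly pay off for large datasets with strided or otherwise irregular access patterns; measure before keeping them: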
#include <xmmintrin.h> // For _mm_prefetch
// Applies process_sample() to every element of input, writing the results to
// output, while issuing software prefetch hints ahead of the read position.
//
// input/output must not overlap. n is the element count.
void process_large_buffer(const float * restrict input,
                          float * restrict output,
                          size_t n) {
    // One 64-byte cache line holds 16 floats, so one hint per 16 elements
    // covers every line exactly once (64 bytes is the typical line size).
    const size_t floats_per_line = 64 / sizeof(float);
    // Look-ahead measured in ELEMENTS: 64 floats = 256 bytes = 4 cache lines
    const size_t prefetch_distance = 64;
    for (size_t i = 0; i < n; i++) {
        // Prefetch data that will be needed soon; the bounds check keeps the
        // computed address inside the array
        if ((i % floats_per_line) == 0 && i + prefetch_distance < n) {
            // The intrinsic is documented as taking a char const pointer
            _mm_prefetch((const char *)&input[i + prefetch_distance], _MM_HINT_T0);
        }
        // Process current element
        output[i] = process_sample(input[i]);
    }
}
// Cache-blocked processing of multichannel audio.
// The sample range is cut into blocks of up to 4096 frames; for each block
// the callback is invoked once per channel, in channel order, with a pointer
// to that channel's block start and the block length. The last block may be
// shorter than 4096.
void process_multichannel_blocked(
    float ** restrict channels,
    size_t n_channels,
    size_t n_samples,
    void (*process)(float*, size_t)) {
    const size_t block_size = 4096; // 16 KiB of float per channel, sized for L1
    size_t start = 0;
    while (start < n_samples) {
        // Clamp the block end to the end of the data
        size_t stop = start + block_size;
        if (!(stop < n_samples)) {
            stop = n_samples;
        }
        const size_t len = stop - start;
        // Process all channels for this block
        for (size_t c = 0; c < n_channels; c++) {
            process(channels[c] + start, len);
        }
        start += block_size;
    }
}
#include <stdalign.h>
// Bad: threads writing to adjacent elements cause cache ping-pong.
// The struct is only sizeof(float) bytes, so consecutive elements of an
// array of these sit next to each other in memory and several of them
// share one cache line.
typedef struct {
float result;
} thread_data_bad;
// Good: pad to cache line size (typically 64 bytes).
// alignas(64) on the first member raises the alignment of the whole struct
// to 64, and the padding member fills the remaining bytes, so every array
// element starts on its own 64-byte boundary and occupies 64 bytes.
typedef struct {
alignas(64) float result;
char padding[64 - sizeof(float)];
} thread_data_good;
- <stdint.h> fixed-width types
- float for samples, double for accumulation
- size_t for array indices
- complex.h for complex numbers
- aligned_alloc() for SIMD (32-byte for AVX)
- alignas() for stack arrays
- restrict qualifier on all non-aliasing pointers
- const when appropriate
- static inline for small utility functions
- Function attributes (hot, cold, pure, const)
- Link-time optimization with -flto
- #pragma omp simd for vectorization hints
- #pragma omp parallel for for multi-core parallelism
- #pragma omp parallel for combined with #pragma omp simd for nested parallelism
-std=c17 # Modern C standard
-O3 # Aggressive optimization
-march=native # Use all available CPU features
-flto # Link-time optimization
-ffast-math # Aggressive float optimization (when safe)
-fopenmp # OpenMP support
Profile with perf (Linux), Instruments (macOS), or VTune (Intel).
#include <stddef.h>
#include <math.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdlib.h>
typedef float sample_t;
// Modern FIR filter with all optimizations
void fir_filter_modern(
const sample_t * restrict input,
const sample_t * restrict coeffs,
sample_t * restrict output,
size_t n_samples,
size_t n_taps) {
// Parallel across output samples
#pragma omp parallel for schedule(static) if(n_samples > 512)
for (size_t i = 0; i < n_samples; i++) {
sample_t acc = 0.0f;
// Vectorize inner product
#pragma omp simd reduction(+:acc) aligned(input, coeffs: 32)
for (size_t j = 0; j < n_taps && j <= i; j++) {
acc = fmaf(input[i - j], coeffs[j], acc); // FMA
}
output[i] = acc;
}
}
// Usage example: filters one second of 48 kHz audio with a 128-tap FIR.
// Returns 0 on success, EXIT_FAILURE if any buffer cannot be allocated.
int main(void) {
    const size_t n_samples = 48000; // 1 second at 48kHz
    const size_t n_taps = 128;
    // Allocate 32-byte aligned memory. Both byte counts (192000 and 512) are
    // multiples of 32, as aligned_alloc() requires.
    sample_t *input = aligned_alloc(32, n_samples * sizeof *input);
    sample_t *coeffs = aligned_alloc(32, n_taps * sizeof *coeffs);
    sample_t *output = aligned_alloc(32, n_samples * sizeof *output);
    if (!input || !coeffs || !output) {
        // free(NULL) is a no-op, so releasing all three is safe
        free(input);
        free(coeffs);
        free(output);
        return EXIT_FAILURE;
    }
    // ... initialize input and coeffs ...
    fir_filter_modern(input, coeffs, output, n_samples, n_taps);
    free(input);
    free(coeffs);
    free(output);
    return 0;
}
gcc -std=c17 -O3 -march=native -fopenmp -ffast-math fir.c -o fir -lm
Signal processing in C has evolved dramatically since 1993. Modern approaches emphasize:
restrict, inline, fixed-width types, alignment
The 1993 paper's core insight remains true: "Write simple, clear programs and examine their performance" - but now compilers and hardware give us far more performance from simple code than was possible 30 years ago.
Standards: ISO/IEC 9899:2018 (C17), ISO/IEC 9899:2023 (C23)
Documentation: Intel Intrinsics Guide, ARM NEON Intrinsics Reference, OpenMP 5.2 Specification, GCC Optimization Options
Libraries: FFTW, Intel MKL, Apple Accelerate, ARM Performance Libraries