I would do the SSE version slightly differently. There is no need for bitwise tricks: SSE2 has an instruction that compares bytes directly (`_mm_cmpgt_epi8`), which saves one instruction and one constant vector.
// Computes the Luhn sum, modulo 10, over the 16 bytes starting at `s`.
//
// The bytes at even offsets (0, 2, ..., 14) are doubled, which is the Luhn
// weighting for a 16-digit number whose check digit is the last byte. A return
// value of 0 therefore means the 16 digits satisfy the Luhn checksum.
//
// Preconditions (not checked here): `s` must point to at least 16 readable
// bytes, since a full 16-byte vector is always loaded, and those bytes are
// expected to be ASCII digits. Bytes below '0' (including NUL) decode to 0
// because of the saturating subtraction; bytes above '9' are not rejected.
int luhn( const char* s )
{
    const __m128i ascii_zero = _mm_set1_epi8( '0' );
    // 0x00ff in every 16-bit lane selects the low (even-offset) byte of each pair
    const __m128i even_bytes = _mm_set1_epi16( 0x00ff );
    const __m128i nine = _mm_set1_epi8( 9 );

    // Unaligned 16-byte load, then ASCII -> numeric value with unsigned saturation
    __m128i digits = _mm_subs_epu8( _mm_loadu_si128( ( const __m128i* )s ), ascii_zero );

    // Adding the masked copy doubles the even-offset bytes and leaves the rest alone
    const __m128i doubled_part = _mm_and_si128( digits, even_bytes );
    digits = _mm_add_epi8( digits, doubled_part );

    // Wherever a byte exceeds 9 (signed byte compare), subtract 9 from it
    const __m128i excess = _mm_and_si128( _mm_cmpgt_epi8( digits, nine ), nine );
    digits = _mm_sub_epi8( digits, excess );

    // SAD against zero adds up each 8-byte half into its own 64-bit lane
    const __m128i halves = _mm_sad_epu8( digits, _mm_setzero_si128() );
    // Bring the upper lane down to the lower one and add the two partial sums
    const __m128i total = _mm_add_epi64( halves, _mm_unpackhi_epi64( halves, halves ) );

    return _mm_cvtsi128_si32( total ) % 10;
}