- char *sp = (char *)src;
- char *dp = (char *)dest;
- for(;count--;) *dp++ = *sp++;
- return dest;
+ const int wordmask = sizeof(void*)-1;
+ uintptr_t src = (uintptr_t)__src;
+ uintptr_t dst = (uintptr_t)__dest;
+
+ if( count < sizeof(void*)*2 || (dst & wordmask) != (src & wordmask) )
+ {
+ char *dp = __dest;
+ const char *sp = __src;
+ while(count--) *dp++ = *sp ++;
+ }
+ // TODO: Bulk aligned copies
+ #if 0
+ else if(count > 128 && (dst & 15) == (src & 15) )
+ {
+ // SSE/bulk copy
+ for( ; dst & 15; count -- )
+ *(char*)dst++ = *(char*)src++;
+ memcpy_16byte(dst, src, count / 16);
+ dst += count & ~15;
+ src += count & ~15;
+ count &= 15;
+ while(count --)
+ *(char*)dst++ = *(char*)src++;
+ }
+ #endif
+ else
+ {
+ void **dp, **sp;
+ for( ; count && (dst & wordmask) != 0; count -- )
+ *(char*)dst++ = *(char*)src++;
+
+ dp = (void*)dst; sp = (void*)src;
+ while( count >= sizeof(void*) )
+ {
+ *dp++ = *sp++;
+ count -= sizeof(void*);
+ }
+ dst = (uintptr_t)dp; src = (uintptr_t)sp;
+ for( ; count; count -- )
+ *(char*)dst++ = *(char*)src++;
+ }
+
+ return __dest;