This test was run in the following environments.
x86 :
Intel Core(TM)2 Duo T9400 2.53GHz
GCC-4.4.5
ARM :
OMAP4430 (Cortex-A9)
Android NDK platform 9
Test code.
/*
* Test Configuration
*/
/* Number of int elements processed by each benchmark run (8 * 1024 * 1024).
 * The body is parenthesized so the macro behaves as a single value in any
 * expression, e.g. `x / _ARRSZ` or `x % _ARRSZ`. */
#define _ARRSZ (1024 * 1024 * 8)
/* Source buffer read by the copy benchmarks. */
static int _arr[_ARRSZ];
/* Fill the source array so that every element holds its own index. */
static void
_init() {
    int idx = 0;

    while (idx < _ARRSZ) {
        _arr[idx] = idx;
        idx++;
    }
}
/*
 * Return the current time of day in microseconds, as reported by
 * gettimeofday(). A failing gettimeofday() trips assert(0).
 */
static unsigned long long
_utime() {
    struct timeval tv;

    if (gettimeofday(&tv, NULL))
        assert(0);
    /* Widen tv_sec BEFORE multiplying: on targets where time_t is 32 bits
     * wide, tv_sec * 1000000 would overflow (undefined behavior for a signed
     * type) before the cast could take effect. */
    return (unsigned long long)tv.tv_sec * 1000000ULL
        + (unsigned long long)tv.tv_usec;
}
/*
 * Shared prologue for the array-copy tests. Declares, in the scope of the
 * function that uses it:
 *   i  - loop index (also read by _operation()),
 *   ut - timestamp / elapsed-time variable,
 *   ia - heap-allocated destination buffer of _ARRSZ ints.
 * The allocation is checked so that a failed malloc() stops the test instead
 * of letting the copy loops write through a NULL pointer.
 */
#define _test_arraycopy_pre()                   \
    int i;                                      \
    unsigned long long ut;                      \
    int* ia = malloc(_ARRSZ * sizeof(*ia));     \
    assert(ia != NULL);
/* Shared epilogue: releases the buffer allocated by _test_arraycopy_pre(). */
#define _test_arraycopy_post()                  \
    free(ia);
/* Per-element work of the benchmark: copy element i of the source array _arr
 * into the destination buffer ia. Relies on `i` and `ia` being in scope at the
 * point of use. */
#define _operation() \
    ((void)(ia[i] = _arr[i]))
/*
 * Thread entry point for the two-thread test: applies _operation() to the
 * upper half of the index range, [_ARRSZ / 2, _ARRSZ).
 *
 * arg: destination buffer (int*) handed over by the creating thread.
 * Always returns NULL.
 */
static void*
_test_arraycopy_worker(void* arg) {
    int* ia = arg;
    int  i  = _ARRSZ / 2;

    while (i < _ARRSZ) {
        _operation();
        i++;
    }
    return NULL;
}
/*
 * Single-thread run: apply _operation() to every index in [0, _ARRSZ) on the
 * calling thread and return the difference between the _utime() readings
 * taken just before and just after the loop.
 */
static unsigned long long
_test_arraycopy_sc() {
    unsigned long long elapsed;
    _test_arraycopy_pre();

    ut = _utime();
    i = 0;
    while (i < _ARRSZ) {
        _operation();
        ++i;
    }
    elapsed = _utime() - ut;

    _test_arraycopy_post();
    return elapsed;
}
/*
 * Two-thread run: a worker thread applies _operation() to the upper half of
 * the index range while the calling thread handles the lower half,
 * [0, _ARRSZ / 2). Returns the difference between the _utime() readings taken
 * before pthread_create() and after pthread_join(), so thread start-up and
 * join are part of the measured time.
 */
static unsigned long long
_test_arraycopy_dc() {
    pthread_t thd;
    _test_arraycopy_pre();

    ut = _utime();
    if (pthread_create(&thd,
                       NULL,
                       &_test_arraycopy_worker,
                       (void*)ia))
        assert(0);
    for (i = 0; i < (_ARRSZ / 2); i++)
        _operation();
    /* The worker's return value is not used, so no out-pointer is passed. */
    if (pthread_join(thd, NULL))
        assert(0);
    ut = _utime() - ut;

    _test_arraycopy_post();
    return ut;
}
/* Retire the prologue/epilogue helper macros; nothing below this point uses them. */
#undef _test_arraycopy_pre
#undef _test_arraycopy_post
/*
 * Fill the source array, then run the single-thread (SC) test followed by the
 * two-thread (DC) test and print both elapsed times on one line.
 */
int
main(int argc, char* argv[]) {
    (void)argc; /* command-line arguments are not used */
    (void)argv;

    _init();
    /* The test functions return unsigned long long, so the matching
     * conversion is %llu (%lld expects a signed argument). */
    printf(">> SC : %llu ", _test_arraycopy_sc());
    printf(">> DC : %llu\n", _test_arraycopy_dc());
    return 0;
}
[Test 1] (elapsed time in microseconds; SC = single thread, DC = two threads)
x86
>> SC : 59346 >> DC : 38566
>> SC : 59195 >> DC : 39028
>> SC : 49529 >> DC : 38160
>> SC : 49722 >> DC : 38457
>> SC : 49952 >> DC : 37457
ARM
>> SC : 102295 >> DC : 94147
>> SC : 102264 >> DC : 94025
>> SC : 102173 >> DC : 94116
>> SC : 102172 >> DC : 94116
>> SC : 102325 >> DC : 94177
Now change the '_operation' macro as follows:
/* Heavier per-element work: XOR _arr[i] with its own low byte shifted left by
 * 8 bits (when i > _ARRSZ / 2) or by 16 bits (otherwise) and store the result
 * in ia[i]. Relies on `i` and `ia` being in scope at the point of use.
 * The last line carries no trailing backslash, so the definition ends here
 * and does not swallow the line that follows it. */
#define _operation() \
    do { \
        if (i > _ARRSZ / 2) \
            ia[i] = ((_arr[i] & 0xff) << 8) ^ _arr[i]; \
        else \
            ia[i] = ((_arr[i] & 0xff) << 16) ^ _arr[i]; \
    } while (0)
[Test 2] (elapsed time in microseconds; SC = single thread, DC = two threads)
x86
>> SC : 60696 >> DC : 40523
>> SC : 56907 >> DC : 45355
>> SC : 55066 >> DC : 42329
>> SC : 54931 >> DC : 40651
>> SC : 57022 >> DC : 41879
ARM
>> SC : 164514 >> DC : 112671
>> SC : 163971 >> DC : 112854
>> SC : 164521 >> DC : 112976
>> SC : 163940 >> DC : 112732
>> SC : 164245 >> DC : 112671
Interesting result, isn't it?
For code dominated by memory access (Test 1), ARM gains very little from multi-core (in this case, dual-core) optimization.
But when each element also requires some computation (Test 2), the optimization shows quite good results.
And x86 seems to handle memory access from multiple cores quite well.
So, developers should take this characteristic of ARM into account when optimizing code for multi-core.
(I'm sure that ARM will improve this someday! :-) )
* Things to consider regarding this kind of optimization *
Cache, Cache coherence, Memory Controller, Bus etc...
[ Tests for later versions of ARM (e.g. Cortex-A15) will be added later... ]