This test is done under following environment.

x86 :

intel Core(TM)2 Duo T9400 2.53Mhz
GCC-4.4.5

ARM :

OMAP4430 (Cortax-A9)
Android NDK platform 9

Test code.

/*
 * Test Configuration
 */
#define _ARRSZ    1024*1024*8

static int _arr[_ARRSZ];

static void
_init() {
    int i;
    for (i = 0; i < _ARRSZ; i++)
        _arr[i] = i;
}

static unsigned long long
_utime() {
    struct timeval tv;
    if (gettimeofday(&tv, NULL))
        assert(0);
    return (unsigned long long)(tv.tv_sec * 1000000)
        + (unsigned long long)tv.tv_usec;
}

#define _test_arraycopy_pre()               \
    int  i;                                 \
    unsigned long long ut;                  \
    int* ia = malloc(_ARRSZ * sizeof(*ia));

#define _test_arraycopy_post()              \
    free(ia);

#define _operation()            \
    do {                        \
        ia[i] = _arr[i];        \
    } while (0)

static void*
_test_arraycopy_worker(void* arg) {
    int     i;
    int*    ia = arg;
    for (i = (_ARRSZ / 2); i < _ARRSZ; i++)
        _operation();
    return NULL;
}

static unsigned long long
_test_arraycopy_sc() {
    _test_arraycopy_pre();

    ut = _utime();
    for (i = 0; i < _ARRSZ; i++)
        _operation();
    ut = _utime() - ut;

    _test_arraycopy_post();

    return ut;
}

static unsigned long long
_test_arraycopy_dc() {
    pthread_t thd;
    void*     ret;
    _test_arraycopy_pre();

    ut = _utime();
    if (pthread_create(&thd,
               NULL,
               &_test_arraycopy_worker,
               (void*)ia))
        assert(0);

    for (i = 0; i < (_ARRSZ / 2); i++)
        _operation();

    if (pthread_join(thd, &ret))
        assert(0);

    ut = _utime() - ut;

    _test_arraycopy_post();

    return ut;
}

#undef _test_arraycopy_pre
#undef _test_arraycopy_post

int
main(int argc, char* argv[]) {
    _init();
    printf(">> SC : %lld ", _test_arraycopy_sc());
    printf(">> DC : %lld\n", _test_arraycopy_dc());
    return 0;
}

[Test 1]
x86

>> SC : 59346 >> DC : 38566
>> SC : 59195 >> DC : 39028
>> SC : 49529 >> DC : 38160
>> SC : 49722 >> DC : 38457
>> SC : 49952 >> DC : 37457

ARM

>> SC : 102295 >> DC : 94147
>> SC : 102264 >> DC : 94025
>> SC : 102173 >> DC : 94116
>> SC : 102172 >> DC : 94116
>> SC : 102325 >> DC : 94177

Change '_operation' macro to as follows

#define _operation()                                    \
    do {                                                \
        if (i > _ARRSZ / 2)                             \
            ia[i] = (_arr[i] & 0xff) << 8 ^ _arr[i];    \
        else                                            \
            ia[i] = (_arr[i] & 0xff) << 16 ^ _arr[i];   \
    } while (0)                                         \

[Test 2]
x86

>> SC : 60696 >> DC : 40523
>> SC : 56907 >> DC : 45355
>> SC : 55066 >> DC : 42329
>> SC : 54931 >> DC : 40651
>> SC : 57022 >> DC : 41879

ARM

>> SC : 164514 >> DC : 112671
>> SC : 163971 >> DC : 112854
>> SC : 164521 >> DC : 112976
>> SC : 163940 >> DC : 112732
>> SC : 164245 >> DC : 112671

Interesting result, isn't it?
For heavily-memory-accessing-code (Test 1), ARM does not show good statistics for multi-core (in this case, dual-core) optimization.
But, if not (Test 2), optimization shows quite good results.
And, x86 seems to handle memory accessing from multi-core, quite well.

So, developers should consider ARM's characteristic when optimize codes for multi-core.
(I'm sure that ARM will improve this someday! :-) )

* Things to consider regarding this kind of optimization *
Cache, Cache coherence, Memory Controller, Bus etc...

[ Test for later version of ARM (ex Cortax-A15) will be listed continuously... ]

'Domain > ARM' 카테고리의 다른 글

[ARM] Sample code for unwinding stack with DWARF2 information.  (0) 2010.04.07
[ARM] Sample code for unwinding stack in Thumb mode.  (0) 2010.04.07
[ARM] Unwinding Stack.  (0) 2009.09.15
[ARM] Long jump.  (0) 2007.06.05
[ARM] .init_array section  (0) 2007.03.18

Following code is part of software - written by me - that is used to analyze memory data.
Even if this just small part of software, I think this can help understand the way of unwinding stack by using DWARF information. Following sample code is to analyze DWARF information generated from C code.
For more details about unwinding stack. See this post.

const CDwf_Dbg*
CFrameS::_Get_dwf_dbg(DvAddr pc) const
{
  CDwf_Dbg*   dwf_dbg = NULL;

  for(int i=0; i<_dwfs->Size(); i++) {
    if( (*_dwfs)[i]->Is_in_fde(pc) ) { dwf_dbg = (*_dwfs)[i]; break; }
  }
  return dwf_dbg;
}

// "frm->regs" should have valid values before call this function.
int
CFrameS::_Set_frame_info(_CFrm_Info* fi) const
{
  if(REG_VAL_UNDEF==fi->regs[_tgt->Pci()].from) { return -1; }

  const CDwf_Dbg* dwf_dbg = _Get_dwf_dbg(REG_PC(fi->regs[_tgt->Pci()].val));

  if(dwf_dbg)
  {
    if( dwf_dbg->ArangeS() ) {
      ASSERT(REG_VAL_UNDEF != fi->regs[_tgt->Pci()].from);
      fi->fdie = dwf_dbg->ArangeS()->Subprogram_die(REG_PC(fi->regs[_tgt->Pci()].val), &fi->cudie);
    }
    else {
      fi->cudie = NULL;
      fi->fdie = NULL;
    }

    if( dwf_dbg->FdeS() ) {
      ASSERT(REG_VAL_UNDEF != fi->regs[_tgt->Pci()].from)
      fi->fde = dwf_dbg->FdeS()->Get_fde(REG_PC(fi->regs[_tgt->Pci()].val), NULL, NULL);
    }
    else { fi->fde = NULL; }

    if(fi->cudie)
    {
      CLine*  ln = CLine::Create(*fi->cudie);
      ln->Lineno(REG_PC(fi->regs[_tgt->Pci()].val), &fi->line, &fi->column);
      if(ln) { MDELETE(ln); }
    }
  }
  else
  {
    // There is no debugging information about this.
    // But we have address value of PC. So, it's not fail!
    fi->cudie = NULL;
    fi->fdie = NULL;
    fi->fde = NULL;
  }

  return 0;
}

int
CFrameS::_Create_previous_frame_info(_CFrm_Info* outfi, const _CFrm_Info* fi) const
{
  ASSERT(outfi && fi);

  bool          b_exact = true;
#ifdef __RT_DEBUG__
  DvAddr        fde_lopc, fdie_lopc;
#endif // __RT_DEBUG__

  if( (REG_VAL_UNDEF == fi->regs[_tgt->Pci()].from)
      || (REG_VAL_UNDEF == fi->regs[_tgt->Spi()].from) ) { goto no_info; }

  const CDwf_Dbg* dwf_dbg = _Get_dwf_dbg(REG_PC(fi->regs[_tgt->Pci()].val));

  if(dwf_dbg && dwf_dbg->FdeS() && fi->fde)
  {
    DvHalf      lri = dwf_dbg->FdeS()->Get_return_addr_register(fi->fde);

    // initial reg values of this frame
    if(0 > dwf_dbg->FdeS()->Get_all_regs(outfi->regs, fi->regs,
                                          fi->fde, REG_PC(fi->regs[_tgt->Pci()].val)))
    { goto fail; }

    if(REG_VAL_UNDEF == outfi->regs[lri].from) { goto fail; }

    outfi->regs[_tgt->Pci()] = outfi->regs[lri];
    COMPENSATE_PC_AHEAD_OF_LR(outfi->regs[_tgt->Pci()].val);

    if(fi->fdie && fi->cudie)
    {
#ifdef __RT_DEBUG__
      fde_lopc = dwf_dbg->FdeS()->Fde_lopc(fi->fde);
      if(0 > fi->fdie->LoPC(&fdie_lopc)) { goto fail; }

      if(fde_lopc != fdie_lopc) { ASSERT(FALSE); goto fail; }
#endif // __RT_DEBUG__

      if(fi->fdie->Has(DW_AT_frame_base))
      {
        DvUSign             val;
        int                 val_type;
        DvAddr              culo;
        bool                b_exact = false;
        CArr<CLoc_Desc*> locarr;
        if(0 > fi->fdie->Frame_base(&locarr) ) {goto fail; }
        ASSERT(REG_VAL_UNDEF != fi->regs[_tgt->Pci()].from);
        if(0 > fi->cudie->LoPC(&culo)) { goto fail; }

        if(0 > CLocS::Interpret(&val_type, &val, &b_exact,_tgt,
                                REG_PC(fi->regs[_tgt->Pci()].val) - culo,
                                INVALID_FRAME_BASE, fi->regs, locarr) ) { goto fail; }
        if(!b_exact){ goto fail; } // this should be exact value.!

        //ASSERT( (LOC_VAL_REG == val_type) || (LOC_VAL_UCONST == val_type) );
        outfi->regs[_tgt->Spi()].val = val;
        outfi->regs[_tgt->Spi()].from = REG_VAL_FROM_REG;
      } // if(fi->fdie->Has(DW_AT_frame_base))
      else if(fi->fdie->Has(DW_AT_abstract_origin) )
      { // this may be inlined function
        CDieI* ofdie = fi->fdie->Original_die(ORI_PURE);

#ifdef  __RT_DEBUG__
        ODPRINTF(("========*********=========\n"));
        ODPRINTF(("Tag: %s\n", dwarf_get_tagCN(fi->fdie->Tag()) ));
        fi->fdie->ODPrint_attributes();
        ODPRINTF(("=========*********============\n"));
        ODPRINTF(("Tag:%s\n", dwarf_get_tagCN(ofdie->Tag()) ));
        ofdie->ODPrint_attributes();
        //ASSERT(FALSE);
#endif // __RT_DEBUG__
        if(ofdie->Has(DW_AT_inline)) {
          outfi->regs[_tgt->Spi()] = fi->regs[_tgt->Spi()];
        }
        else { ASSERT(FALSE); }
        MDELETE(ofdie);
      }
      else { ASSERT(FALSE); }
    }
    else
    {
#ifdef COMPILER_ADS12
      DvSign off;
      if( 0 <= dwf_dbg->FdeS()->Get_cfa_offset_via_sp(&off, fi->fde, fi->regs[_tgt->Pci()].val) )
      {
        if(0 == off) { goto fail; } // If value of stack pointer is not changed, than we can assume that virtual unwinding fails.
        else {
          outfi->regs[_tgt->Spi()].val = fi->regs[_tgt->Spi()].val+off;
          outfi->regs[_tgt->Spi()].from = fi->regs[_tgt->Spi()].from;
        }
      }
      else
#endif // COMPILER_ADS12
      { goto no_info; }
    }
  }
  else  // if(dwf_dbg && dwf_dbg->FdeS() && fi->fde)
  {
    static const int  __UNW_INST_CNT  = 1000;

    bool b_success = false;
    // try to unwind stack with raw machine instruction!!
    if(_unw)
    {
      UnwRT*    unwr;
      MMNEW(unwr, , UnwRT, _tgt->Num_regs());
      for(int i=0; i<_tgt->Num_regs(); i++) {
        unwr[i].v = fi->regs[i].val;
        unwr[i].o = (REG_VAL_UNDEF==fi->regs[i].from)? UNWR_INVALID: UNWR_VALID;
      }

      if( UNW_OK == _unw->Unw(unwr, __UNW_INST_CNT, _stk_base))
      {
        for(int i=0; i<_tgt->Num_regs(); i++) {
          ASSERT( (UNWR_VALID==unwr[_tgt->Pci()].o)&&(UNWR_VALID==unwr[_tgt->Spi()].o) );
          outfi->regs[i].val = unwr[i].v;
          // We can assume that valid register value comes from stack.
          outfi->regs[i].from = (UNWR_VALID==unwr[i].o)? REG_VAL_FROM_ADDR: REG_VAL_UNDEF;
        }
        b_success = true;;
      }
      MMDELETE(unwr);
      COMPENSATE_PC_AHEAD_OF_LR(outfi->regs[_tgt->Pci()].val);
    } // if(_unw)
    if(!b_success) { goto fail; }
  } // else  // if(dwf_dbg && dwf_dbg->FdeS() && fi->fde)

  // Unexpected case. - infinite loop!! if this happened
  // If PC is on 'very strange position' this can be happened
  // previous frame is exactly same with current frame! This cannot happened in normal situation!
  if( (REG_VAL_UNDEF != outfi->regs[_tgt->Pci()].from)
      && (outfi->regs[_tgt->Spi()].val == fi->regs[_tgt->Spi()].val)
      && (outfi->regs[_tgt->Pci()].val == fi->regs[_tgt->Pci()].val)
      && (outfi->regs[_tgt->Lri()].val == fi->regs[_tgt->Lri()].val) )
  { goto fail; }

  if(0 > _Set_frame_info(outfi) ) { goto fail; }

  return (b_exact)? FRAME_OK: FRAME_NOT_EXACT;

fail:
  return FRAME_FAIL;

no_info:
  return FRAME_NO_DBG_INFO;

}

int
CFrameS::Construct_frame_stack(const RegT* regs)
{
  for(int i=0; i<_frame.Size(); i++) { MDELETE(_frame[i]); }
  _frame.Reset();

  int         frm_ret;
  bool        b_end = false;

  // flag to retry frame unwind
  bool        b_retry = true;
  _CFrm_Info* prev_fi = NULL;
  _CFrm_Info* next_fi = NULL;

  MNEW(prev_fi, , _CFrm_Info, (_num_regs));
  for(int i=0; i<_num_regs; i++) {
    prev_fi->regs[i].from = regs[i].from;
    prev_fi->regs[i].val = regs[i].val;
  }

  if(0 > _Set_frame_info(prev_fi) ) { goto fail; }
  _frame.Append(prev_fi);
  _ODPrint_frame_name(prev_fi);
  prev_fi = prev_fi->Clone();

  for(;;)
  {
    ASSERT(REG_VAL_UNDEF != prev_fi->regs[_tgt->Pci()].from);

    MNEW(next_fi, , _CFrm_Info, (_num_regs));
    frm_ret = _Create_previous_frame_info(next_fi, prev_fi);

    switch(frm_ret)
    {
      case FRAME_NOT_EXACT:
      case FRAME_OK:
        ASSERT(prev_fi && next_fi);
        _frame.Append(next_fi);
        MDELETE(prev_fi);
        prev_fi = next_fi->Clone();
        next_fi = NULL;
#ifdef __RT_DEBUG__
        if(prev_fi->fdie) {
          _ODPrint_frame_name(prev_fi);
        }
        else if(REG_VAL_UNDEF != prev_fi->regs[_tgt->Pci()].from) {
          ODPRINTF(("-- [0x%8x]\n", prev_fi->regs[_tgt->Pci()].val));
        }
#endif // __RT_DEBUG__

#ifdef ARCHITECTURE_ARM
        // In ARM, stack grows in negative direction.
        if( (REG_VAL_UNDEF != prev_fi->regs[_tgt->Spi()].from)
          && (prev_fi->regs[_tgt->Spi()].val >= _stk_base ) ) {
          frm_ret = FRAME_END;
          goto end_of_loop;
        }
#endif // ARCHITECTURE_ARM
      break;

      case FRAME_FAIL:
      case FRAME_NO_DBG_INFO:
        ASSERT(prev_fi);
        MDELETE(next_fi); 

        // Check whether previous frame is top of frame or not.
        if( (1 == _frame.Size()) && b_retry )
        {
          b_retry = false;
          ///////////////////////////////////
          // Let's try again with LR!! if previous frame is top of stack!!
          // (There are lots of cases that pc is corrupted in the top of stack!! (ex. prefetch abort))

          /////////////////////////////////////////////////////////////
          // CHECK IT! : "Try with LR" is good for approximation?!
          //  - Careful investigation is needed!
          /////////////////////////////////////////////////////////////
          DvAddr  lrpc = prev_fi->regs[_tgt->Lri()].val;
          COMPENSATE_PC_AHEAD_OF_LR(lrpc);

          if( (REG_VAL_UNDEF != prev_fi->regs[_tgt->Lri()].from)
              && (lrpc != prev_fi->regs[_tgt->Pci()].val) )
          {
            prev_fi->regs[_tgt->Pci()].val = lrpc;
            if(0 > _Set_frame_info(prev_fi) ) { goto fail; }  // set new frame info for retrying with LR.
          }
          else { goto end_of_loop; }
        }
        else { goto end_of_loop; }
      break;

      default:
        ASSERT(FALSE);
    } // switch
    continue;

end_of_loop:
    break;
  } // for(;;)

  if(prev_fi) { MDELETE(prev_fi); }
  if(next_fi) { MDELETE(next_fi); }

  _frame_status = frm_ret;
  return 0;

fail:
  if(prev_fi) { MDELETE(prev_fi); }
  if(next_fi) { MDELETE(next_fi); }
  return -1;
}

'Domain > ARM' 카테고리의 다른 글

[ARM] Multi-core optimization test...  (0) 2011.05.03
[ARM] Sample code for unwinding stack in Thumb mode.  (0) 2010.04.07
[ARM] Unwinding Stack.  (0) 2009.09.15
[ARM] Long jump.  (0) 2007.06.05
[ARM] .init_array section  (0) 2007.03.18

Following code is part of software - written by me - that is used to analyze memory data.
I think this can be good sample to help understand the way of "unwinding stack by pseudo execution". For more details see this post.

static void
_invalidate_variable_registers(UnwRT* r)
{
  for(int i=0; i<13; i++) { r[i].o = UNWR_INVALID; }
}

int
CRawunw::_Unw_thumb(UnwRT* r, unsigned int inst_cnt, unsigned int stack_base) const
{
#define __IS_IN_DUMPED_STACK(aDDR) ( (stack_base>=(aDDR)) && ((aDDR)<=osp) )

#ifdef UNW_ESCAPE_INFINITE_LOOP
  CArr<_CBInfo>   cbi;
#endif // UNW_ESCAPE_INFINITE_LOOP
  unsigned short instr;
  unsigned int   osp = r[13].v; // original sp.
  for(;;)
  {
    if(inst_cnt==0) {
      return UNW_INST_CNT_EXPIRED;
    }
    // PC is still on thumb.
    if(!(r[15].v&0x1)) { return UNW_FAIL; }
    //ASSERT(r[15].v&0x1);

    // Check PC & SP is OK.
    if((UNWR_INVALID==r[15].o)||(UNWR_INVALID==r[13].o)) {
		//ASSERT(FALSE);
		return UNW_FAIL;
    }

    if(stack_base <= r[13].v) {
      return UNW_END_OF_FRAME;
    }

    if(0 > _read2(_user, r[15].v&(~0x1), &instr)) {
      /*ASSERT(FALSE);*/ return UNW_FAIL;
    }

    // Undefined instruction
    if( ((instr & 0xf801) == 0xe801)
#if 0 // followings can be 'defined instrucion' in future!
        || ((instr & 0xff00) == 0xbf00)
        || ((instr & 0xff00) == 0xb100)
        || ((instr & 0xfa00) == 0xb200)
        || ((instr & 0xfc00) == 0xb800)
#endif
        ) {
      // undefined instruction. Unwinding SHOULD BE Stopped!
      return UNW_FAIL;
    }

    //Hi register operations/branch exchange
    else if((instr & 0xfc00) == 0x4400)
    {
      unsigned char    op  = (instr & 0x0300) >> 8;
      char h1  = (instr & 0x0080) ? TRUE: FALSE;
      char h2  = (instr & 0x0040) ? TRUE: FALSE;
      unsigned char    rs = (instr & 0x0038) >> 3;
      unsigned char    rd = (instr & 0x0007);

      // Adjust the register numbers
      if(h2) { rs += 8; }
      if(h1) { rd += 8; }

      if(op != 3 && !h1 && !h2) {
        ODPRINTF(("Invalid Instruction:0x%x", instr));
        return UNW_FAIL;
      }

      switch(op)
      {
        case 0: // ADD
          if((15==rd)&&(13==rd)) { ASSERT(FALSE); return UNW_FAIL; }
          else {
            r[rd].v += r[rs].v;
            r[rd].o = ((UNWR_VALID == r[rd].o) && (UNWR_VALID == r[rs].o) )? UNWR_VALID: UNWR_INVALID;
            ASSERT(UNWR_VALID == r[13].o);
          }
        break;

        case 1: // CMP
          ; // nothing to do!
        break;

        case 2: // MOV
          if((15==rd)&&(14==rs)&&(UNWR_VALID==r[rs].o)) {
            r[15]=r[14];
            return UNW_OK;
          }
          else {
            r[rd] = r[rs];
            ASSERT(UNWR_VALID == r[13].o);
          }
        break;

        case 3: // BX
          // if "14==rs" and it is valid than this is return!
          if(14==rs)
          {
            if(UNWR_VALID==r[rs].o) {
              ODPRINTF(("!!!Return PC=0x%x\n", r[rs].v));
              r[15]=r[rs];
              return UNW_OK;
            }
            else {
              ASSERT(FALSE);
              ODPRINTF(("!!!BX Fail\n"));
              return UNW_FAIL;
            }
          }
          // BX to somewhere. Just ignore it! - do nothing!!
          ODPRINTF(("!!!BX is ignored! [%s] 0x%x\n", (UNWR_VALID==r[rs].o)? "Valid": "Invalid", r[rs].v));
        break;
      }
    }
    //  ADD sp,#+imm
    //  ADD sp,#-imm
    else if((instr & 0xff00) == 0xb000)
    {
      unsigned short value = (instr & 0x7f) * 4;
      /* Check the negative bit */
      if(instr & 0x80) {
        r[13].v -= value;
        ODPRINTF(("*** SP(-) : 0x%x\n", r[13].v));
      }
      else {
        r[13].v += value;
        ODPRINTF(("*** SP(+) : 0x%x\n", r[13].v));
      }
    }
    //  PUSH {Rlist}
    //  PUSH {Rlist, LR}
    //  POP {Rlist}
    //  POP {Rlist, PC}
    else if((instr & 0xf600) == 0xb400)
    {
      char  L     = (instr & 0x0800) ? TRUE : FALSE;
      char  R     = (instr & 0x0100) ? TRUE : FALSE;
      unsigned char     r_list = (instr & 0x00ff);

      if(L) // stack grows in negative direction in ARM.
      {
        for(int i=0; i<8; i++)
        {
          if(r_list & (0x1 << i))
          {
            // value only in dumped stack is valid!
            if( __IS_IN_DUMPED_STACK(r[13].v) ) {
              // Read the word
              if(0 > _read4(_user, r[13].v, &(r[i].v)) ) { ASSERT(FALSE); return UNW_FAIL; }
              r[i].o = UNWR_VALID;
              ODPRINTF(("  r%d = 0x%08x\n", i, r[i].v));
            }
            else { r[i].o = UNWR_INVALID; }
            r[13].v += 4;
          }
        }

        // Check if the PC is to be popped
        if(R)
        {
          // Get the return address
          if(0 > _read4(_user, r[13].v, &(r[15].v)) ) { ASSERT(FALSE); return UNW_FAIL; }
          r[15].o = UNWR_VALID;
          ODPRINTF((" Return PC=%x\n", r[15].v));
          r[13].v += 4;
          return UNW_OK; // success.
        } // if(R)
        ODPRINTF(("*** SP(L) : 0x%x\n", r[13].v));
      } // if(L)
      else
      {
        // Stack grows. But we can't believe the values. So, just increase stack pointer!!
        // Check if the LR is to be pushed
        if(R) { r[13].v -= 4; }
        for(int i=7; i>=0; i--) {
          if(r_list & (0x1 << i)) { r[13].v -= 4; }
        }
        ODPRINTF(("*** SP(!L) : 0x%x\n", r[13].v));
      }
    }
    //  B label
    else if((instr&0xf800) == 0xe000)
    {
      short brv = (instr&0x07ff);
      if(brv & 0x400) { brv |= 0xf800; }

      // Branch distance is twice that specified in the instruction.
      brv *= 2;

#ifdef UNW_ESCAPE_INFINITE_LOOP
      int  idx = -1;
      for(int i=0; i<cbi.Size(); i++) {
        if(cbi[i].pc==r[15].v) { idx=i; break; }
      }
      if(idx>=0)
      { // found
        if(cbi[idx].b_jump) { cbi[idx].b_jump = false; }
        else {
          cbi[idx].b_jump = true;
          // branch
          r[15].v += brv; r[15].v += 2; // prefetch advance.
        }
      }
      else
      {
        // default is "Jump"
        _CBInfo  info(r[15].v, true);
        cbi.Append(info);
        // branch
        r[15].v += brv; r[15].v += 2; // prefetch advance.
      }
#else // UNW_ESCAPE_INFINITE_LOOP
      r[15].v += brv;
      // Need to advance by a word to account for pre-fetch.
      //  Advance by a half word here, allowing the normal address
      //  advance to account for the other half word.
      r[15].v += 2;
#endif // UNW_ESCAPE_INFINITE_LOOP
    }
#ifdef UNW_ESCAPE_INFINITE_LOOP
    // B<cond> label
    else if((instr&0xf000) == 0xd000)
    {
      short immed8  = instr&0x00ff;
      int  idx = -1;
      for(int i=0; i<cbi.Size(); i++) {
        if(cbi[i].pc==r[15].v) { idx=i; break; }
      }
      if(idx>=0)
      { // found
        if(cbi[idx].b_jump) { cbi[idx].b_jump = false; }
        else {
          cbi[idx].b_jump = true;
          // try to jump
          r[15].v += immed8*2;
          r[15].v += 2; // prefetch advance.
        }
      }
      else {
        // default is "Pass"
        _CBInfo  info(r[15].v, false);
        cbi.Append(info);
      }
    }
#endif // UNW_ESCAPE_INFINITE_LOOP
    // ADD <Rd>, PC/SP, #<immed_8>*4
    // It is not essential But, because sp and pc are always 'valid', result is valid.
    // So, I put this. BUT IT'S NOT ESSENTIAL!
    else if((instr&0xf000) == 0xa000)
    {
      unsigned char rd = (instr&0x0700)>>8;
      unsigned short imm8 = (instr&0x00ff)<<2;
      if(instr&0x0800) { r[rd].v = r[13].v + imm8; }    // SP
      else { r[rd].v = (r[15].v & 0xfffffffc) + imm8; } // PC
    }
    // LDR <Rd> [SP, #<immd_8>*4]
    // It is not essential. But, because original stack is 'valid', in most case, result is valid.
    // So, I put this. BUT IT'S NOT ESSENTIAL!
    else if((instr&0xf800) == 0x9800)
    {
      char  rd = instr&0x0700 >> 8;
      short imm8d = instr&0x00ff;
      unsigned int addr = r[13].v+imm8d*4;
      if(addr%4) { ASSERT(FALSE); return UNW_FAIL; }
      if(osp<=addr) { // valid stack.
        if(0 > _read4(_user, addr, &(r[rd].v))) { ODPRINTF(("Cannot read from stack\n")); ASSERT(FALSE); return UNW_FAIL; }
        r[rd].o = UNWR_VALID;  // assume that value from stack or constant!.
      }
      else {
        r[rd].o = UNWR_INVALID;
      }
    }
    else  // invalidate registers
    {
      _invalidate_variable_registers(r);
    }

    r[15].v += 2;
    inst_cnt--;
  }

#undef __IS_IN_DUMPED_STACK
}

'Domain > ARM' 카테고리의 다른 글

[ARM] Multi-core optimization test...  (0) 2011.05.03
[ARM] Sample code for unwinding stack with DWARF2 information.  (0) 2010.04.07
[ARM] Unwinding Stack.  (0) 2009.09.15
[ARM] Long jump.  (0) 2007.06.05
[ARM] .init_array section  (0) 2007.03.18
[[ blog 이사 과정에서 정확한 posting날짜가 분실됨. 년도와 분기 정도는 맞지 않을까? ]]

Everybody knows how useful unwinding stack is.

There are some ways to implement this.

1. By using FP. (at runtime without risk, but limited)
If FP is available, unwinding stack is very easy. Nothing to explain. (MSVC7 uses FP in debug mode build on Intel-CPU-PC.)

2. By pseudo execution.(at runtime with risk.)
If FP is not available, we can get LR values by continuous pseudo execution analyzing opcode.
Prerequisite to do this is "Knowing stack, register and values of code area - instructions". So, after dumping stack and register, we can do this outside process by reading elf and dumped data. Or this can be done inside process at runtime, because at runtime, we know all those. During pseudo execution, we should skip 'function call'; we are not making perfect emulator!. Because of this, we cannot trust register value obtained by calculation or read from memory except for the one from stack; we cannot know things done inside skipped function and returned value. What we can trust is only the value read from stack. And especially, what we are interested in is LR value - usually, located in the stack.

In general, function - sub routine - is consist of prologue, main body and epilogue. In the prologue, function saves previous(caller) state. And at the epilogue, control back to caller state. And, at this moment, PC is back to LR. So, finding epilogue part is significant. As I mentioned, register value this is not read from stack is unreliable. Therefore, operation with these value is meaningless. But focusing on PC, LR and SP is enough for unwinding stack. We should track these 3 values by pseudo execution. Therefore, we can narrow down instructions to those that are affect to above three registers and executing these instruction virtually is efficient and reasonable even if it is not 100% enough.
We can consider handling following instructions in case of ARM Thumb mode.

unsigned short instr;

(instr & 0xfc00) == 0x4400 : Hi register operations/branch exchange.
(instr & 0xff00) == 0xb000 : SP operation.
(instr & 0xf600) == 0xb400 : push & pop
(instr & 0xf800) == 0xe000 : B label.

There is important thing to take care of. During pseudo running, instruction is fetched from code area. But, this unwinding doesn't emulate perfectly. So, sometimes, we may come across unexpected case; Value of pseudo PC is invalid. Unwinding routine tries to read value from the address of pseudo PC. This result in "Data Abort Exception" in ARM. Yes, this is very dangerous. So, we should check that pseudo PC value is valid or not carefully. Or instead of unwinding , we can just dump stack and register values, and then unwind with this data outside process. Dumping stack and register values is not dangerous at all.

Unwinding by pseudo execution can give quite reliable unwinding result. And even without debugging information, we can use this. So, it is very powerful. Example code for unwinding stack with this way in ARM can be found at here.

3. By filtering values in the stack. (at runtime without risk.)
Critical disadvantage of "2" is that there is possibility of crash during pseudo execution.
Someone may need safer one. And that is the way using stack filtering. This is very safe but less accurate; But it gives enough information to guess call stack.
Basic concept is,

At function prologue, usually, LR is stored at the stack. So, most of LRs that can show call stack are somewhere in the stack. So, stack dump already includes call stack. The issue is how can we extract valid information to know call stack. Simple and efficient way is filtering address that in the range of code area from stack dump. Than this will be superset of real call stack.

Even if the result is superset, developer can know which one is dummy value intuitively based on his/her experience. So, this is useful. Above all, this is very safe because the only operation that can be wrong is reading stack value. But software can verify whether SP is valid or not by comparing with TCB. If SP is not valid, we can ignore this to avoid software crash. Same with above (pseudo execution), we also do this outside process with dumped data.

Then, how can we know that address value is in code range or not?
In the process (unwinding at runtime), we can use linker-generating-symbols that indicates memory block. In case ADS or RVCT, we can refer scatter-load file.
Outside process, we can use information of ELF file, Or using memory map file is also fine.

4. Analyzing debugging (ex. DWARF) information with dumped stack. (Impossible at runtime.)
This way can give best information. The amount of information is totally depends on amount of dump. If we dumped entire RAM, we can know every information at that moment; It's just like the state that program is stopped at the break point using interactive debugger. But it is not easy to implement.
We should remind one thing. In case assembly codes, usually, generated debugging information by compiler is less than C. And sometimes, this prevent us from walking call stack. So, we would better to use "2 - pseudo execution - way" together to handling exceptional case like this. That is, if debugging information is available, it is used. But not, unwinding by pseudo execution.

Here is more detail sample example. 

[[ blog 이사 과정에서 정확한 posting날짜가 분실됨. 년도와 분기 정도는 맞지 않을까? ]]

RVCT and ADS make long jump be possible by adding veneer. But, interestingly, link tool of RVCT and ADS, added veneer only when jump to the code located in other execution area. So, in the case that long jump to the same-execution-area-code is needed, linker generates "memory relocation error" in ADS and RVCT. That's why we need to classify code and put them into several execution area.

TI's TMS470 adds veneer('Trampoline' in TMS) even if target code of long jump is in same execution area. But TMS470 is very slow... :-(

I have no idea about gnu arm eabi tools...

[[ blog 이사 과정에서 정확한 posting날짜가 분실됨. 년도와 분기 정도는 맞지 않을까? ]]

".init_array" Section ".fini_array"
.init_array contains pointers to blocks of code that need to be executed when an application is being initialized (before main() is called). It is used for a number of things, but the primary use is in C++ for running static constructors; a secondary use that is sometimes used is to initialize IO systems in the C library.
If you are not using C++ you may (depending on your C library) be able to live without it entirely; but you'd need to hack your startup code to deal with this.
.init_array probably ends up in ram because its marked read/write -- that happens because in a dynamic linking environment the dynamic linker has to fix up all the pointers it contains before it can be used. In a static environment you might be able to get away with forcing it into a read-only section. 

[[ blog 이사 과정에서 정확한 posting날짜가 분실됨. 년도와 분기 정도는 맞지 않을까? ]]

Here are unexpected exceptions that can be occurred in SW(ARM).

  * Data Abort
  * Prefetch
  * Undefined Instruction
  * Divide by zero.

And here is sample code to raise these exceptions.

#pragma O0   /* disable all optimization */

typedef void (*_FuncT)(void);
void _test_data_abort(void)
{
    int   c;
    int*  p = (int*)0xdeaddead;
    c = *p;
}

void _test_prefetch_abort(void)
{
    _FuncT  f;
    f = (_FuncT)0xdeaddead;
    (*f)();
}

static const int _undef_inst = 0xff0000ff;
void _test_undefined_instruction()
{
    _FuncT f;
    f = (_FuncT)(&_undef_inst);
    (*f)();
}

(divide by zero) is omitted

In '_test_undefined_instruction()' , variable '_undef_inst' is located at 4byte-aligned-position. So, program mode becomes ARM mode when "f=(_FuncT)(&_undef_inst);" is executed - BX to 4byte-aligned-address (address & 0x1 == 0x00).

"MSB <--FFxxxxxx--> LSB" area is 'undefined instruction' area in ARM instruction. So, we can make endian-independent-'undefined instruction' by setting '_undef_inst' as 0xff0000ff.

'Domain > ARM' 카테고리의 다른 글

[ARM] Sample code for unwinding stack with DWARF2 information.  (0) 2010.04.07
[ARM] Sample code for unwinding stack in Thumb mode.  (0) 2010.04.07
[ARM] Unwinding Stack.  (0) 2009.09.15
[ARM] Long jump.  (0) 2007.06.05
[ARM] .init_array section  (0) 2007.03.18

+ Recent posts