Real Checko's Blog: ARM

顯示具有 ARM 標籤的文章。顯示所有文章

9.14.2010

奇怪mcu的I2C奇怪波形

悲哀的 cheap mcu，送出的 I2C 波形竟然是..

可以看到在 START command 前，會送出一個 CLK, SDA 都是 Low 的 pulse，然後 start condition 後， SDA 竟然會有一個 high pulse。

雖然這樣沒有違反 I2C wave form standard(?)

ref : wiki

但是有些 chip 就是不吃這樣的波形呀。

能偷就偷，不知道這種 chip design house 的品質是怎樣做出來的...

8.10.2010

Build Assembly code in VS2005

這是參考 tcpmp 的 project 檔。

大概也跟 eVC 一樣，就是為 assembly.asm 加上 custom build command.

先把 assembly code 的 source file 寫好
開啟 VS2005，把 assembly file 加到 source folder 中
在 assembly file 按右鍵，選 properties
Custom Build Step -- General --Command Line,, 寫armasm.exe -cpu xscale "$(InputPath)" "$(IntDir)/$(InputName).obj"
Output 寫$(IntDir)/$(InputName).obj

8.09.2010

CpuId - 紀錄一下

因為 CE 6.0 沒有辦法 SetKMode，所以只好... 寫一個 driver，然後 load 進去..,

這就是 copy 自 tcpmp getcpuid 的 code 後，run 起來：

我們的： 4117B362,1D152152,0,0
測試的： 410FB764,1D992992,0,0

第一個，cpu id

4117B362:ref(http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0360f/CACEDHJG.html)
所以是 ARM1136, rev2

410FB764: ref (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0360f/CACEDHJG.html)
所以是 ARM1176 , rev 4

第二個，cache type register

ref : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0211k/I67616.html

[31:29] : always 0
[28:25] : ctype - cache type b1110 means "write back","register 7 operation","format C"
[24] : separate D and I cache
[23:12] : D cache size
[11:0] : I cache size

其中 D 和 I cache size encoding format:

[11] : P
[10:9] : always 0
[8:6] : size
[5:3] : association (way)
[2] : M
[1:0] Length

所以
1D152152 : 16K, 4 way, len=32 bytes.
1D992992: 32K, 4 way, len=32 bytes.

driver code

GETID.C:

#include <windows.h>
#include <drvlib.h>


//-----        Code Control        -----
// NOTE(review): GID_DEBUG is defined here, but the guard below tests DEBUG and
// nothing in the code shown tests GID_DEBUG -- confirm which macro is intended.
#define GID_DEBUG

#ifdef DEBUG
// Debug settings, compiled only when DEBUG is defined: the name string "Getid",
// sixteen zone-name strings (only the first, "Verbose", is non-empty), and a
// trailing value of 0 (presumably the initial zone mask -- confirm against the
// DBGPARAM declaration).
DBGPARAM dpCurSettings = { TEXT("Getid"), {
    TEXT("Verbose"), TEXT(""), TEXT(""), TEXT(""),
    TEXT(""), TEXT(""), TEXT(""), TEXT(""),
    TEXT(""), TEXT(""), TEXT(""), TEXT(""),
    TEXT(""), TEXT(""), TEXT(""), TEXT("")},
0x00000000
};
#endif

// Assembly routine (see arm.s). It stores the values it reads through the
// DWORD pointer; the int argument is passed through from the caller.
extern void GetCpuId(int,DWORD*);

// Clears a 4-DWORD result buffer and then lets GetCpuId fill it in, so any
// entry GetCpuId does not write reads back as 0.
//   Id : forwarded unchanged to GetCpuId.
//   p  : caller-owned buffer of at least 4 DWORDs; a NULL pointer is ignored.
void SafeGetCpuId(int Id, DWORD* p)
{
    // Guard first: both memset and the assembly routine write through p.
    if (p == NULL)
        return;

    memset(p, 0, 4 * sizeof *p);
    GetCpuId(Id, p);
}


//---        GID_Init
// Init entry point of the GID driver. Reads the CPU ID words through
// SafeGetCpuId, traces them with RETAILMSG and shows them in a message box.
//   dwContext : not used by this implementation.
//   returns   : TRUE (non-zero) unconditionally.
DWORD GID_Init(DWORD dwContext)
{
    DWORD CpuId[4];     // zeroed by SafeGetCpuId before GetCpuId fills it
    TCHAR outmsg[60];   // 4 x up to 8 hex digits + 3 commas + NUL = 36 TCHARs at most

    RETAILMSG(1, (TEXT("[GID]    GID_Init+\r\n")));

    SafeGetCpuId(0, CpuId);
    RETAILMSG(1, (TEXT("%X,%X,%X,%X\r\n"), CpuId[0], CpuId[1], CpuId[2], CpuId[3]));
    swprintf(outmsg, TEXT("%X,%X,%X,%X"), CpuId[0], CpuId[1], CpuId[2], CpuId[3]);

    // Blocks until the box is dismissed; this driver exists only to display the IDs.
    MessageBox(NULL, outmsg, TEXT("GID"), MB_OK);

    RETAILMSG(1, (TEXT("[GID]    GID_Init OK \r\n")));
    return TRUE;
}



//---        GID_Deinit
// Counterpart of GID_Init. Only traces the call; there is nothing to release.
//   dwContext : ignored.
//   returns   : TRUE unconditionally.
BOOL GID_Deinit(DWORD dwContext)
{
    RETAILMSG(1, (TEXT("[GID]    GID_Deinit+\r\n")));
    return TRUE;
}



//---        GID_Open
// Traces the call and hands dwData straight back as the result.
//   dwData      : returned unchanged.
//   dwAccess    : ignored.
//   dwShareMode : ignored.
DWORD GID_Open(DWORD dwData, DWORD dwAccess, DWORD dwShareMode)
{
    RETAILMSG(1, (TEXT("[GID]    GID_Open+\r\n")));
    return dwData;
}



//---        GID_Close
// Traces the call; no per-handle state exists to tear down.
//   Handle  : ignored.
//   returns : TRUE unconditionally.
BOOL GID_Close(DWORD Handle)
{
    RETAILMSG(1, (TEXT("[GID]    GID_Close+\r\n")));
    return TRUE;
}



//---        GID_Read
// Stub: traces the call and returns 0 without touching pBuffer.
//   Handle, pBuffer, dwNumBytes : all ignored.
DWORD GID_Read(DWORD Handle, LPVOID pBuffer, DWORD dwNumBytes)
{
    RETAILMSG(1, (TEXT("[GID]    GID_Read+\r\n")));
    return 0;
}



//---        GID_Write
// Stub: traces the call and returns 0 without reading pBuffer.
//   Handle, pBuffer, dwNumBytes : all ignored.
DWORD GID_Write(DWORD Handle, LPCVOID pBuffer, DWORD dwNumBytes)
{
    RETAILMSG(1, (TEXT("[GID]    GID_Write+\r\n")));
    return 0;
}



//---        GID_Seek
// Stub: traces the call and returns (DWORD)-1.
//   Handle, lDistance, dwMoveMethod : all ignored.
// NOTE(review): the EXPORTS list in GETID.DEF does not include GID_Seek --
// confirm whether leaving it unexported is intended.
DWORD GID_Seek(DWORD Handle, long lDistance, DWORD dwMoveMethod)
{
    RETAILMSG(1, (TEXT("[GID]    GID_Seek+\r\n")));
    return (DWORD)-1;
}



//---        GID_IOControl
// Stub: traces the call and returns FALSE for every control code.
// None of the parameters are read and *pdwActualOut is never written.
BOOL GID_IOControl(DWORD  Handle,
                   DWORD  dwCode,
                   PBYTE  pBufIn,
                   DWORD  dwLenIn,
                   PBYTE  pBufOut,
                   DWORD  dwLenOut,
                   PDWORD pdwActualOut)
{
    RETAILMSG(1, (TEXT("[GID]    GID_IOControl+\r\n")));
    return FALSE;
}

// DLL entry point (the sources file names it via DLLENTRY=GID_DllEntry).
// On process attach it calls DisableThreadLibraryCalls for this module;
// every other reason is a no-op. Always returns TRUE.
//   hinstDll   : instance handle of this DLL.
//   dwReason   : reason the routine is being called.
//   lpReserved : system parameter, ignored.
BOOL
GID_DllEntry(HINSTANCE hinstDll, DWORD dwReason, LPVOID lpReserved)
{
    if (dwReason != DLL_PROCESS_ATTACH)
        return TRUE;

    DisableThreadLibraryCalls((HMODULE) hinstDll);
    return TRUE;
}

arm.s

    ; GetCpuId -- declared in GETID.C as: void GetCpuId(int, DWORD*)
    ; Reads two CP15 c0 registers (opcode2 = 0 and opcode2 = 1) and stores them
    ; at [r1+0] and [r1+4]. Only these two words are written; the rest of the
    ; caller's buffer is left untouched.
    INCLUDE kxarm.h

EXPORT GetCpuId

TEXTAREA

LEAF_ENTRY GetCpuId
  export GetCpuId
; r0 <- CPSR. This overwrites the first (int) argument, which is never used.
mrs    r0,cpsr

; Keep CPSR bits [3:0]; continue only when all four are set. Otherwise branch
; to UserMode and return without executing the CP15 reads or writing the buffer.
and r0,r0,#15
cmp r0,#15
bne UserMode
; CP15 c0,c0 opcode2 = 0 -> r0 (CPU ID)
mrc p15,0,r0,c0,c0,0
; NOTE(review): the purpose of the nop pairs after each mrc is not evident here.
nop
nop
; CP15 c0,c0 opcode2 = 1 -> r2 (cache type)
mrc p15,0,r2,c0,c0,1
nop
nop
; Store both words through the pointer argument in r1.
str r0,[r1,#0]
str r2,[r1,#4]
UserMode
; Return to the caller.
mov    pc,lr

ref (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0360f/CACEDHJG.html) CpuID 取的是 opcode2 = 0 和 1 : cpu id 跟 cache type.
GETID.DEF

LIBRARY getid

EXPORTS
GID_Init
GID_Deinit
GID_Open
GID_Close
GID_Read
GID_Write
GID_IOControl

sources


TARGETNAME=getid
TARGETTYPE=DYNLINK
RELEASETYPE=PLATFORM

DLLENTRY=GID_DllEntry
DEFFILE=$(_TARGETPLATROOT)\SRC\DRIVERS\getid\getid.def

PREPROCESSDEFFILE=1
SYNCHRONIZE_DRAIN=1
WINCEREL=1

TARGETLIBS= \
$(_COMMONSDKROOT)\LIB\$(_CPUINDPATH)\coredll.lib \

SOURCELIBS= \

SOURCES= \
getid.c \
arm.s

load driver 的程式就是以前那個 ActivateDeviceEx()，改一下 dll name 就可以

ref http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0211k/I67616.html
用

c0, Core feature ID registers

好像可以得到更多 cpu feature

8.05.2010

pipeline bubble

利用 assembly 來 optimize speed 時，要注意，並不是 instruction 行數短就夠了。
還要注意 pipeline 的特性。

pipeline 將指令分成幾個 stage：

fetch,
decode/register read,
alu,
memory write,
register update.

如果這樣個 assembly codes:

 ldr r0,#1
add r3,r0,r1

第二行指令在 decode/register read 時，r0 還沒update，因為上一行才執行到alu operation。所以在pipeline中的第二行指令就要停下來，等兩個clock，等上一行指令執行到register update後，才可以繼續。
這樣，就白白浪費 2 個 clock 了。

為了避免這樣的情況，寫 assembly 的時候，就可以在中間插入下面要作的 assembly code，不要白白浪費這兩個clock

在 (很久)前面的文章：yuv - rgb color space convert 的 assembly code 可以看到。最後一個作者的assembly code 就有做到這個(所以整個 code 的 flow 變得不好trace)。實測的結果，也是最快的。

在 Wiki 里也有說明：

instruction scheduling: Instruction scheduling is an important optimization for modern pipelined processors, which avoids stalls or bubbles in the pipeline by clustering instructions with no dependencies together, while being careful to preserve the original semantics.

8.04.2010

cache

cache 是一小塊和cpu時脈一樣的記憶體，作為 cpu 與主記憶體間的 buffer。
當 cpu 要讀取 memory 時， cache 先把 memory 的資料讀進 cache 中，然後再讓 cpu 讀取。

cpu 每次讀取的單位是 int (32bit)，但是 cache 為求效能，會一次讀入比 int 還多的資料。
cache 每次讀入的資料大小叫 line size，每次讀入的資料放入一個 line。

cache 就是利用這樣預先讀取的能力，讓cpu 下次需要讀取記憶體時，可以直接由 cache 拿，不需要再向主記憶體讀取。

由於 cache 只有小小一塊，所以必須要有一個機制知道 cpu 需要存取記憶體時，那個 address的內容是不是已經在cache 里，如果在，是在 cache 的那一條 line ...

最直覺的方法就是去找每一個 cache line 的 address，看看那一個 line 所 cache 的記憶體位址是不是 cpu 現在要的。

這樣的 cache 叫 fully associative cache.

但是這樣作很耗空間(電路)，因為每一個 line 都要作一個 address 比較電路。
所以就有比較便宜的作法.. 把 address 分成兩部份：

tag, index

以 32 bit 的 address line 為例，可以分成：

tag : 31~20
index : 19~0

這樣要找cache 時就可以反過來用，用 address的 0~19作 index，找到那個 cache index line 的 tag，然後跟 31~20比較就可以。

後來又有折衷的辦法，就是...每個 index 可以有兩個 tag欄位和lines，這樣，每20 條位址線，就可以共用兩條 cache line。 -- 這就叫 2 way cache.
=>這樣就是：拿 index 找到對應的 2 個 tag ，比較和 address bit 31-20 一不一樣。

依此類推，可以有 4 way cache。

這一篇 (http://www.mirabilisdesign.com/Pages/Demonstrations/systemarchitecture/Cache_System/Functional_Cache_Model_Overview.html)雖然是 sim software 的說明，但是可以看到 ARM11 cache 的 implementation 圖例

cache line size 是 16 words (address 0~3), index size 是 4096 (address 4~15), tag field 是 address 16~31，

所以 cache index size x line size (bytes) x way = cache size

7.19.2010

VFP in CE 6.0

有關 CE 6 的 ARM VFP 支援

這一篇 Google Groups 討論說

The VS2005 compiler used by PB for CE6.0 emits no VFP instructions and
unless you implement FPCRT.DLL for an ARM then all CE implementations
are integer (default MS implementation of FPCRT uses software
emulation), even if you have a hardware VFP in your core (e.g. ARM11
or Cortex-A8). To actually use the VFP you must do the following:

1. Use SYSGEN_OEM_FPCRT - catalog item "OEM Floating Point CRT (ARM
only)
2. Download the VFPv2 library support from the ARM website and
incorporate it into your build:
www.arm.com/products/os/windowsce_vfp_dl.html

Instructions are included with the download from ARM.

My tests show a 4x increase in performance on floating point
operations on a release build on an ARM1136JF-S core.

Andrew.

也就是說，CE 6 的 VFP driver(?) library 是由 ARM 提供的。

還有這一篇 MSDN 討論，說

The current version of Windows CE supports the ARMV4I architecture. Cortex A8 it's ARMv7 (ARM's naming can be even more confusing than ms one...) and currently the compiler does not use its specific features (NEON, VFP3 etc.).

The new release of Windows CE (named Windows Embedded Compact 7) will support the ARMv7 architecture.

Here you can find some information about the new features of this release:

http://www.microsoft.com/windowsembedded/en-us/products/windowsce/compact7.mspx

You can download a public beta from connect.microsoft.com.

If your project has not a very short time frame (you plan to release your device Q2 next year, for example), you may consider using this new release for your development to be able to "unleash" all the power of your CPU core.

Valter Minute
Windows Embedded MVP
http://geekswithblogs.net/WindowsEmbeddedCookbook

所以你要用 CE 6 寫 VFP 或是 NEON 的 code 是不可能的，要等 Windows Embedded Compact 7.

總合起來，可以要看一下 msdn :
Including a Replacement Floating Point C Run-Time Library in a Run-Time Image
還有這一篇的實做 copy 下來，免得不見：

Download and install the "ARM® VFPv2 Floating Point Support Library" from ARM.

If building with Platform Builder to build your OS image

simply add the FPCRT project to your solution, and set the following catalog feature

EVM_3530\Core OS\CEBASE\Applications and Services Development\C Libraries and Runtimes\OEM Floating Point CRT (ARM only)

A SYSGEN is required.

If building without Platform Builder, ensure the SYSGEN variable is set. I added the following to my "tinykernel DEBUG.bat" file.

set SYSGEN_OEM_FPCRT=1

Add entries to platform.bib

fpcrt.dll            $(_FLATRELEASEDIR)\fpcrt.dll               NK  SH
k.fpcrt.dll          $(_FLATRELEASEDIR)\fpcrt.dll               NK  SHMK

I copied the contents of %WINCEROOT%\OTHERS\ARM\VFPv2\src\ARMVFPv2\obj to my prebuilt directory.
Then copied the DLL project to my %WINCEROOT%\OTHERS\ARM\VFPv2\src\FPCRT to my %PLATFORMROOT%\%_TGTPLAT%\src\drivers directory.
Modified dirs, and updated sources to point to new library location.

A SYSGEN is required.

Visual Studio 2008 VFP Support

VS2008 ARM compiler supports emulated and hardware floating point. To enable hardware based Vector Floating point you need to use this option

/QRfpe-

This means not floating point emulation.

If you use

/QRfpe

it will generate software floating point, as it means floating point emulation.

Microsoft KB 947894

Note: This is not related to NEON support

有關 multimedia (video/audio/graphic) 部份，ARM 有提供 OpenMax 的實做 (實際用 ARM11, Cortex A8 feature optimize) 的 library
Khronos Standards
...但是依照慣例，ARM的網頁連結半年後一定又會改 >_<

要註冊才能download..
喔？是以 library 的方式提供，用 realview compile (當然不是 VS...)

6.04.2010

CPU 與週邊的速度

CPU 跑 600，週邊 IO 的 clock 是 60。
所以當 CPU 對 GPIO register 寫資料時，是同步在 60MHz...


 ; Write 0x00100000 (bit 20) to the register at GPIO1_PAD_EN.
 ldr   r0, =GPIO1_PAD_EN
 mov   r1, #0x00100000 ; pin 20
 str   r1,[r0]
 ; r0 = address of GPIO1_CTRL20; r1 and r2 hold the two values the loop alternates between.
 ldr   r0, =GPIO1_CTRL20
 mov   r1,#0x120
 mov   r2,#0x160
 ; Endless loop: store r1, then r2, to [r0], with one nop after each store.
1 
 str   r1,[r0]
 nop
 str   r2,[r0]
 nop
 ; Branch back to the local label 1 above.
 b     %b1

這一段 code，有沒有加 nop 輸出的方波都是 10MHz。
但是把 nop 改為


 str   r1,[r0]
 str   r1,[r0]
 str   r2,[r0]
 str   r2,[r0]
 b     %b1

這樣改完，輸出方波變成 5MHz。

也就是說..如果一直 polling 週邊的話，就等於是用 60MHz 在跑....

11.24.2009

ARM cp15 c1,c0, 0 - control register configuration

ARM 的coprocessor cp15 ，可以用


mrc/mcr  p15,0,,Cn,Cm,N

來溝通 (read/write)
參考： cp15 instruction

比較常用的：Control Register Configuration:


MRC p15, 0, , c1, c0, 0  Read Control Register configuration data
MCR p15, 0, , c1, c0, 0 Write Control Register configuration data

Control Register 內容參考：c1, Control Register
-- copy 過來，免得以後改位置--

Table 3.24. Control Register bit functions

Bits	Field	Function
[31]	SBZ	This field returns a Unpredictable value when read. Should Be Zero.
[30]	TE	Determines the state that the processor enters exceptions: 0 = Exceptions entered in ARM state 1 = Exceptions entered in Thumb state.
[29]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[28]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[27]	NMI	Determines the state of the non-maskable bit that is set by a configuration pin FIQISNMI: 0 = The processor is backwards compatible and behaves as normal 1 = All attempts to modify the CPSR F bit can only clear it. There is no way to set it in software. The SPSRs remain freely modifiable but copying the SPSR to CPSR can only clear the F bit. FIQs continue to set the F bit automatically. Note The status of the FIQISNMI pin is read by Bit 27. Software cannot write to Bit 27.
[26]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[25]	EE	Determines how the E bit in the CPSR bit is set on an exception: 0 = CPSR E bit is set to 0 on an exception 1 = CPSR E bit is set to 1 on an exception. The reset value depends on external signals, see Table 3.25.
[24]	VE	Enables the VIC interface to determine interrupt vectors: 0 = Interrupt vectors are fixed 1 = Interrupt vectors are defined by the VIC interface. See the description of the V bit, bit 13.
[23]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[22]	U	Enables unaligned data access operations for mixed little-endian and big-endian operation: 0 = Unaligned data access support disabled 1 = Unaligned data access support enabled. The A bit has priority over the U bit. The reset value of the U bit depends on external signals, see Table 3.25.
[21]	FI	Configures low latency features for fast interrupts. 0 = All performance features enabled. 1 = Low interrupt latency configuration enabled.
[20]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[19]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[18]	SBO	Should Be One. This bit reads as 1 and ignore writes.
[17]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[16]	SBO	Should Be One. This bit reads as 1 and ignore writes.
[15]	L4	Determines if the T bit is set for PC load instructions: 0 = Loads to PC set the T bit. 1 = Loads to PC do not set the T bit, ARMv4 behavior. For more details, see the ARM Architecture Reference Manual.
[14]	RR	Determines the replacement strategy for the cache: 0 = Normal replacement strategy by random replacement 1 = Predictable replacement strategy by round-robin replacement.
[13]	V	Determines the location of exception vectors: 0 = Normal exception vectors selected, address range = `0x00000000-0x0000001C` 1 = High exception vectors selected, address range = `0xFFFF0000-0xFFFF001C`.
[12]	I	Enable or disable level one instruction cache: 0 = disabled 1 = enabled.
[11]	Z	Enables programme flow prediction: 0 = Program flow prediction disabled 1 = Program flow prediction enabled.
[10:8]	SBZ	Should Be Zero. This bit reads as 0 and ignores writes.
[7]	B	Determines operation as little-endian or big-endian memory system and the names of the low four-byte addresses within a 32-bit word: 0 = Little-endian memory system 1 = Big-endian word-invariant memory system. The reset value of the B bit depends on external signals, see Table 3.25.
[6:3]	SBO	Should Be One. This field read as 1 and ignore writes.
[2]	C	Enables or disables level one data cache: 0 = Data cache disabled 1 = Data cache enabled.
[1]	A	Enables strict alignment of data to detect alignment faults in data accesses: 0 = Strict alignment fault checking disabled. 1 = Strict alignment fault checking enabled. The A bit setting takes priority over the U bit.
[0]	M	Enables or disables the MPU: 0 = MPU disabled 1 = MPU enabled.

6.10.2009

iMX31 PDK build Image and upload

紀錄一下，

iMX31 3 stack board 又叫做 PDK (Platform Development Kit)。
search IMX31

IMX31_SDK_14 : IMX31_SDK_WINCE5_BSP.msi

需要上 CE QFE patch 到 2008 12 rollup。

安裝一個platform : C:\WINCE500\PLATFORM\3ds

Platform 是 C:\WINCE500\PBWorkspaces\3dsmobility

依照 IMX31_PDK14_UG.pdf 的說明，install, build image:

因為有上 CE QFE，所以要手動 build ms sd driver
手動build csp
sysgen

Build XLDR 和 EBOOT

手動 build. 這兩個 folder 是不會自動 build 的。

燒錄 image 到 PDK 的tool 是 ATK

IMX_ATK_TOOLKIT_R1_66

燒錄 XLDR.NB0

調整 PDK debug board 的 SW (紅色)，由左到右，依次是：0.0.0.0.0.0 (download mode)
接 RS232 到 debug board，開機
啟動 ATK - 選 iMX31 TOC02, DDR, Go
第二頁選 Flash Tool，Next
NAND - K9K2G08U0A,
Erase
Program - XLDR.NB0 (start address : 0x0000)
Program - EBOOT.NB0 (start address : 0x20000)

完成，調整 debug board SW 0.1.0.0.0.0 (Nand boot) - 開啟 hyper terminal 進入 EBOOT。

6.01.2009

RealView ICE and gdb

參考 RealViewICE Guide
手動安裝 RealView ICE for Linux (其實是 redhat)。
裝完後，會在 /opt/ARM/RVI

執行 /opt/ARM/RVI/RVI/.../linux-pentium/rvconfig 會出現跟 windows 一樣的GUI。
follow windows 的作法，identify 出 JTAG scanchain 上的 cpu.
save : rvi.rvc - copy 到 home 比較方便

執行 /opt/ARM/RVI/RVD/Tools/..../rvigdbconfig -f rvi.rvc

reading file rvi.rvc
Done
connecting to 192.168.144.185
found 192.168.144.185
connected
finished

192.168.144.185 就是 RealView ICE 的 ip

這樣就可以啟動 arm-gdb 了 (用 CodeSourcery) : arm-none-linux-gnueabi-gdb

(gdb) target remote 192.168.144.185:4000
Remote debugging using 192.168.144.185:4000
0x00000000 in ??()
(gdb)

5.26.2009

Worklog : iMX31 Demoboard, build WINCE5.0

iMX31 3 STACK Debug board:

Nand : K9F2G08R0A - 256MB
DDR 128MB

Boot From Nand 的 Dip SW 設定：

SW5 : 0
SW6: 1
SW7: 0
SW8: 0
SW9: 0
SW10: 0

Demoboard 使用 iMX31 和 MC13783 (PMIC with Audio AMP)。

MC13783 的 size 幾乎和 iMX31 一樣大

CSP 放在 C:\WINCE500\PUBLIC\COMMON\OAK\CSP\FREESCALE

MX31 : MX31 相關
MXARM11 : 所有MX 系列中 arm11 相關

CE 的 PQOAL 架構，又把一些 oal 放在其他地方 C:\WINCE500\PLATFORM\COMMON\SRC\ARM\FREESCALE\

依照順序來..

Build CSP:

因為 CSP 不會 auto build (是因為在 public 的關係 ?)，所以要手動 build。
到 file，favorite，CSP 按右鍵，check "clean before build" then do "build current project"

大概就是到 public\common\oak\csp\freescale 下面作 build -c

build 好的 lib 會在 C:\WINCE500\PUBLIC\COMMON\OAK\LIB\ARMV4I\DEBUG

接下來 build BSP (for the 1st time)
build OS menu :

clean before building : X
Copy Files to Release Directory After build : V
Make Run-Time Image After Build : V

然後就按下 "Sysgen"

要 Clean build 整個 BSP (CSP, PQOAL, BSP) 的話：

build CSP - follow 前面的步驟，按右鍵...
build PQOAL

Clean Before Building : V
Copy Files to Release Directory : V
Make Image After Build : X
執行 Build OS - sysgen

build BSP

Clean Before Building : V
Copy Files to Release Directory : V
Make Image After Build : V
執行 Build OS - Build and sysgen current BSP

如果只有某部份的 code 修改，可以只build 那部份的 code，然後執行 Build and sysgen current BSP

到 Files :

CSP 有修改 : 到修改的 folder 按右鍵- check "Clean before Build" 然後 "Build and sysgen current BSP"
PQOAL : 一樣，到 PQOAL folder 按右鍵

然後：

Clean before Build : X
Copy Files to Release Directory After Build : V
Make Run time image after Build : V
執行 : Build and Sysgen Current BSP

5.13.2009

openmoko - flash kernel and rootfs

安裝 usb 連線軟體： dfu-util。
Debian 直接用 apt 安裝就可以。

通常需要

kernel - uImage
rootfs

bootloader 不一定要重燒

rootfs 通常是有 jffs2 在檔名中，kernel 通常就有 uImage 在檔名中。

開機進入 nor boot (nand boot) ? 就可以接上 usb ，和 dfu-util 溝通。

按著hold key 開機，進入nor boot 畫面，插上 usb。dmesg..


[786627.436013] usb 2-3: new full speed USB device using ohci_hcd and address 2
[786627.654996] usb 2-3: configuration #1 chosen from 2 choices
[786627.677190] usb 2-3: New USB device found, idVendor=1d50, idProduct=5119
[786627.677198] usb 2-3: New USB device strings: Mfr=1, Product=2, SerialNumber=3
[786627.677202] usb 2-3: Product: Neo1973 Bootloader U-Boot 1.3.2-moko12
[786627.677205] usb 2-3: Manufacturer: OpenMoko, Inc
[786627.677207] usb 2-3: SerialNumber: 0000000
[786628.281641] cdc_acm: This device cannot do calls on its own. It is no modem.
[786628.281641] cdc_acm 2-3:1.0: ttyACM0: USB ACM device
[786628.288104] usbcore: registered new interface driver cdc_acm
[786628.288112] cdc_acm: v0.26:USB Abstract Control Model driver for USB modems and ISDN adapters

所以應該是有偵測到...

燒錄過程要注意的是...沒有操作30 sec 後，機器會自動 power off。
所以看到螢幕黑掉，就要重新 power on.

rootfs download 比較久。

重開機很花時間，約需要 2 min，約 1min 後才會出現splash screen.然後滅掉..

openmoko new freerunner - boot into nand and nor

Tony新買的Neo Freerunner 換OS後無法開機，停在 openmoko boot screen。
所以可以讓我玩一下..

先follow instruction，檢查一下nor boot 可不可以boot:

.按左上角hold button不放
按右下角power button開機button不放,, 開機
放掉左上角hold button
放掉右下角power button

這樣就開進 nor boot。可以看到 lcd 顯示 nor boot version 和 menu。

接著檢查 nand boot 可不可以 boot:

按著右下角 power button 不放
馬上按左上角 hold button，維持 5~8 sec,,開機
都可以放開了

這樣就開進 nand boot，可以看到 lcd 顯示 nand 和 menu

4.22.2009

build new rev openocd in msys

新 rev 的 openocd 需要作 bootstrap，(也就是說，需要 aclocal, autoconf, automake..)

follow 這一篇..安裝 msys, minGW 環境。

msys - 先安裝 1.0.10
msys-DTK : 是build 需要的一些 tool，像 perl, autoconf, automake... etc
然後 download msys-core-1.0.11，解開後，覆蓋 msys的安裝目錄。

msys 的 update 就是 untar , overwrite ?

之後安裝新版 autoconf, automake, libtool。
安裝方式都是 download ~~mingw patch 過的~~ source code, ./configure , make , make install。

http://ftp.gnu.org/gnu/autoconf/autoconf-2.61.tar.bz2

http://ftp.gnu.org/gnu/automake/automake-1.10.tar.bz2

http://ftp.gnu.org/gnu/libtool/libtool-1.5.24.tar.gz

這些tool 會install 在 /usr/local/bin，和原先的 tool 位置 (/bin) 不同。

OK !

可以到 openocd 下作 bootstrap，configure and make.

因為已經是在 mingw 下作，所以 configure 時不用加 CC="gcc -mno-cygwin"
只需要加 -enable-ft2232_libftdi

configure 發生 exception ，說找不到 libusb0.dll ，是因為沒有 install libusb-win32 driver 的關係，download 那個有 "filter" 字樣的 libusb-win32-exe 下來安裝就可以。
bootstrap 發生 Can't locate object method "path" via package "Autom4te::Request" at /usr/bin/autom4te line 81.

說明是說把 autom4te.cache 刪掉就可以。
的確是這樣，刪掉之後，這個 error 就沒了，但是出現其他的 error。

其實這是 autom4te 版本不一致的關係，安裝過新版的 automake 後就 OK 了 (就不會出現這個 error).

build openocd under mingw - with FT2232 chip interface

(有 update 喔)
openocd 在 mingw 下 build..

先要 install mingw(gcc for win) 和 msys(unix shell for win)

因為安裝 msys 會尋問 mingw在哪，所以要先install mingw:

mingw 現在以經有像 cygwin那樣的網路安裝了，download MinGW-5.1.4.exe
http://sourceforge.net/project/showfiles.php?group_id=2435&package_id=240780

執行後就會出現問題，選安裝，然後額外的 package 選 g++ 和 make..
之後就會自己 download 和安裝完畢

我裝在 C:\MinGW

裝完後要手動把 c:\mingw\bin 加到 cmd 的環境變數。

這樣裝完後，就可以開啟cmd.exe，run gcc 了

但是要run make 和 bash file需要 msys。 download msys-1.0.10.exe
http://downloads.sourceforge.net/mingw/MSYS-1.0.10.exe

直接執行就可以，中途會問你postinstall，回 y。
然後問你有沒有 MinGW，回答有，並且把剛剛install MinGW的 path 寫出來
！！但是因為在 unix 中，目錄的左右斜線是相反的，要注意。
裝完後就可以在 programfile - MinGW- msys 啟動 msys 環境。

這個 openocd 用 FT2232 (usb-232).
所以要 libftdi.a
libftdi.a 會用到 libusb.a

因為是 for windows 版，所以要dowload windows 版 libusb - libusb-win32-device-bin-0.1.12.1.tar.gz
http://libusb-win32.sourceforge.net/

下載的是bin 檔 (prebuild)，所以把 usb.h copy 到 /mingw/include 把 libusb.a copy 到 /mingw/lib

還要安裝 libusb-win32 的 dll : libusb-win32-filter-bin-0.1.12.1.exe
http://sourceforge.net/project/downloading.php?group_id=78138&filename=libusb-win32-filter-bin-0.1.12.1.exe&a=96392657

這樣就可以開始 build libftdi.a - download libftdi-0.15.tar.gz
http://www.intra2net.com/en/developer/libftdi/download.php

解開，到 libftdi-0.15 下， ./configure 然後 make
會出現 error，但是 check 一下 error message，會發現 libftdi.a build 完，是 build example 時有 error，所以不管。

把 src/ftdi.h copy 到 /mingw/include
把 src/.libs/libftdi.a copy 到 /mingw/lib

最後就要 build openocd 了..

download trunk 的版本 (release 版支援的 cpu 比較少..)
解開， run ./bootstrap
./configure -enable-ft2232_libftdi CC="gcc -mno-cygwin"
make

2.18.2009

AVPicture structure 的內容 - 還是 convert_yuv420_rgb565.S

/* Describes the pixel data of one frame, one entry per channel. */
typedef struct AVPicture {
uint8_t *data[4];   /* pointer to each channel's data */
int linesize[4];    /* line size (width) of each channel */
} AVPicture;

是給說明一個 frame 的資料內容。
data 是 channel data.
linesize 是channel 的 linesize (寬度).

所以最多有 4 個 channel 資料 (? 哪一種 format 會有四個 channel ?).
以 YUV420 為例，320 x 240 的圖檔，有 Y.U.V 三個 channel，Y 的 linesize 是 320。U, V 的 linesize 都是 160.
所以用到 data[0]~data[2]，linesize[0]~linesize[2]。

以 bgr565為例，一個 pixel 一個 word：bgr565，所以沒有分開 channel。所以只有用到 data[0] 和 linesize[0]。

所以 convert _yuv420_rgb565.S 的一開始的 argument 處裡：


   ldr r7,  [r0,  #0]       ; Y ptr
   ldr r9,  [r0,  #4]       ; U ptr
   ldr r10, [r0,  #8]       ; V ptr
  subs r10, r10, r9        ; V ptr - U ptr
  ldr r8,  [r0, #12]
  add r8, r8, r7           ; Y + stride_Y
   ldr r4,  [r0, #12]       ; Stride_Y
  mov r4, r4, lsl #1
  sub r4, r4, r2           ; (2 * Stride_Y) - width
   ldr r5,  [r0, #16]       ; Stride_U
  sub r5, r5, r2, lsr #1   ; Stride_U - (width / 2)
   ldr r6,  [r0, #20]       ; Stride_V
  sub r6, r6, r2, lsr #1   ; Stride_V - (width / 2)
  add r0, r1, r2, lsl #1   ; RGB + 1
  stmdb   sp!, { r0-r10 }
; Stack description :
; (sp+ 0) RGB + one line
; (sp+ 4) RGB
; (sp+ 8) width (save)
; (sp+12) height
; (sp+16) (2 * stride_Y) - width
; (sp+20) stride_U - (width / 2)
; (sp+24) stride_V - (width / 2) !!! UNUSED !!!
; (sp+28) Y ptr
; (sp+32) Y ptr + one line
; (sp+36) U ptr
; (sp+40) V - U

就是分別取出
data[0] : Y, data[1] : U, data[2] : V
和
linesize[0] : stride_Y, linesize[1] : stride_U, linesize[2] : stride_V.

那

convert_yuv420_rgb565(AVPicture *pic,unsigned char *out,int width,int height)

的參數 width, 和 Stride_Y 有什麼不一樣？

Stride_Y 是 source 影像的寬度。
width 是你要顯示的寬度 (clip)。

有一張 1024x768 的圖，你可以只顯示 800x768 (左邊部份)。

所以不一樣.

從 assembly 看，AVPicture 的 structure 好像不是這樣，他用的好像是：

/* Layout the assembly appears to assume: three data pointers followed by three
 * line sizes, so linesize[0] sits right after data[2] (offset 12 when pointers
 * are 4 bytes, matching the [r0, #12] load of Stride_Y). */
typedef struct AVPicture{
  char *data[3];
  int  linesize[3];
} AVPicture;

實際上...也是用這個 structure Run 起來的...

2.17.2009

Optimize - learned from yuv420_rgb565

所以 optimize 的方法就是..

optimize for speed
盡量使用 mla ( a = (b*c)+d))
盡量不使用 condition jump
反組譯
exam asm - 盡量使用所有 register
打亂 instruction. 混合 register only arithmetic 和 data load/store

所以應該要try 一下：

先照 assembly algorithm 寫出 c code. - cpu % , 1000 loop time.
反組譯, 寫成 .S link - 確認 work
follow yuv420_rgb565.s 略為修改，但不作 scramble - cpu %, 1000 loop time.
作 scramble, 混合 load/store 和 register only instruction. - cpu %, 1000 loop time

如果有空要這樣作.....

Code Reading - 上一篇的YUV - RGB assembly

最後的 table : rb_clip_dummy, rb_clip, 還有 g_clip_dummy, g_clip。
dummy 都是0，然後兩個加起來， rb 是 16 x 8, g 是16 x 16。
這個大概是跟 paper 講的一樣，用 table 取代 saturation 判斷。
0 的部份就是負值的部份。

所以整個 assembly code 中沒有 compare jmp 指令 (除了最後的 line end 判斷)。
YUV 計算部份直接用 mla ( x +)，沒有使用 table -- 大概是因為 ARM 作 16x16 只要1 個 clock，所以沒必要。

一樣是作 w (寬度)，然後 line.. 一次(loop) 4 pixel。

因為先作 bit shift (5-6-5)再做查表，所以table[]不用太大。

code 沒有避免使用 mul (mla)，反而大量使用，避免不需要的 ldr 動作 (大概是 ldr 和 mla 都是一個 clk 吧)。其中：

r8 : multy 0x00012A15
r9 : (Y-16)
r6 : Coef *(V-128) + 32768
r5 : Coef*(U-128) + 32768
r4 : -Coef *(U-128) - Coef*(V-128) + 32768

剩下的就..

原來 eVC 的 ARMASM 最佳 example code 就是 OS bsp 下所有的 .s file。
所以語法參考 BSP 就可以了。
MSDN : CE .NET 4.2 ASM

大概要改的是：

MS armasm 規定只有 label 可以從一行的第一格開始，所以所有其他的 instruction, directive 都要先空
comment 是以 ; 開頭
label 不可以加 : 號
.byte 改為 DCB
.word 改為 DCD
.text 宣告要改為 AREA |.text|,CODE, ARM
.global 要改 EXPORT

reference
http://checko.blogspot.com/2006/09/writting-arm-assembly-in-embedded-vc_28.html ：

手動加入 yuv420_rgb565.s
project setting - yuv420_rgb565.s - custom build : 填入 armasm ... (Debug, Relese 都要加)

引用的 .cpp 加入：

 // C linkage so the C++ caller binds to the unmangled assembly symbol.
 extern "C" void convert_yuv420_rgb565(char *,char*,int,int);

Q_Q ,, argument passing type 不一樣...

這個 assembly code 有"適當安排過". 可以看到：
有 memory access 的 instruction，接著的會是 register-only 的 operation。
這樣instruction pipeline 就可以沒有阻礙的run 下去 (如果一個 load 下一個 store, pipe line 要等 cache/memory 的同步?)
作法是：要使用之前，在 n 個 instruction 前就 load...

所以下面是重新安排後的 code，變得比較容易看....

;
;     void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results, int w, int h) ;
;

AREA |.text|,CODE,ARM

EXPORT convert_yuv420_rgb565

convert_yuv420_rgb565
   stmdb   sp!, { r4 - r12, lr }   ; all callee saved regs
   ldr r7,  [r0,  #0]       ; Y ptr
   ldr r9,  [r0,  #4]       ; U ptr
   ldr r10, [r0,  #8]       ; V ptr
   subs r10, r10, r9        ; V ptr - U ptr
   ldr r8,  [r0, #12]
   add r8, r8, r7           ; Y + stride_Y
   ldr r4,  [r0, #12]       ; Stride_Y
   mov r4, r4, lsl #1
   sub r4, r4, r2           ; (2 * Stride_Y) - width
   ldr r5,  [r0, #16]       ; Stride_U
   sub r5, r5, r2, lsr #1   ; Stride_U - (width / 2)
   ldr r6,  [r0, #20]       ; Stride_V
   sub r6, r6, r2, lsr #1   ; Stride_V - (width / 2)
   add r0, r1, r2, lsl #1   ; RGB + 1
   stmdb   sp!, { r0-r10 }
   ; Stack description :
   ; (sp+ 0) RGB + one line
   ; (sp+ 4) RGB
   ; (sp+ 8) width (save)
   ; (sp+12) height
   ; (sp+16) (2 * stride_Y) - width
   ; (sp+20) stride_U - (width / 2)
   ; (sp+24) stride_V - (width / 2) !!! UNUSED !!!
   ; (sp+28) Y ptr
   ; (sp+32) Y ptr + one line
   ; (sp+36) U ptr
   ; (sp+40) V - U
   mov lr, r2                         ; Initialize the width counter
   add r0, pc, #(const_storage-.-8)   ; r0 = base pointer to the constants array
   ldr r8, [r0, #(4*4)]               ; r8 = multy  
yuv_loop

   ldr r10, [sp, #28]                 ; r10 = Y
   ldrb r9, [r10, #0]                 ; r9 = *Y

   add r10, r10, #2                   ; r10 = Y + 2
   str r10, [sp, #28]                 ; save Y + 2

   ldr r1, [sp, #36]                  ; r1 = U
   ldrb r11, [r1]                     ; r11 = *U
   add r1, r1, #1                     ;; r1 = U++
   str r1, [sp, #36]                  ; store U++

   ldr r2, [sp, #40]                  ; r2 = V - U
   add r2, r1, r2                     ; r2 = V+1
   ldrb r12, [r2, #-1]                ; r12 = *V

   sub r11, r11, #128                 ; r11 = *U - 128
   sub r12, r12, #128                 ; r12 = *V - 128

   mov r7, #32768                     ; r7 = 32768 (for additions in MLA)

   add r0, pc, #(const_storage-.-8)   ; r0 = base pointer to the constants array
   ldr r1, [r0, #(4*0)]               ; r1 = crv
   mla r6, r1, r12, r7                ; r6 = nonyc_r = crv * (*V - 128) + 32768

   ldr r2, [r0, #(4*3)]               ; r2 = -cgv
   mla r4, r2, r12, r7                ; r4 = - cgv * (*V - 128) + 32768

   ldr r3, [r0, #(4*1)]               ; r3 = cbu
   mla r5, r3, r11, r7                ; r5 = nonyc_b = cbu * (*U - 128) + 32768  

   sub r9, r9, #16                    ; r9 = *Y - 16
   mla r7, r8, r9, r6                 ; r7 = (*Y - 16) * multy + nonyc_r

   ldr r0, [r0, #(4*2)]               ; r0 = -cgu
   mla r4, r0, r11, r4                ; r4 = nonyc_g = - cgu * (*U - 128) + r4 = - cgu * (*U - 128) - cgv * (*V - 128) + 32768

   add r0, pc, #(rb_clip-.-8)         ; r0 contains the pointer to the R and B clipping array
   ldrb r7, [r0, r7, asr #(16+3)]     ; r7 = R composant
 
   mla r12, r8, r9, r5                ; r12 = (*Y - 16) * multy + nonyc_b
   mla r1, r8, r9, r4                 ; r1 = (*Y - 16) * multy + nonyc_g

   ldrb r12, [r0, r12, asr #(16+3)]   ; r12 = B composant (and the start of the RGB word)
   add r12, r12, r7, lsl #11          ; r12 = .GB ...

   add r11, pc, #(g_clip-.-8)         ; r11 now contains the pointer to the G clipping array
   ldrb r1, [r11, r1, asr #(16+2)]    ; r1 contains the G part of the RGB triplet
   add r12, r12, r1, lsl #5           ; r12 = RGB ... (ie the first pixel (half-word) is done)

   ; --- next pixel
   ldrb r9, [r10, #-1]                ; r9 = *(Y+1)
   sub r9, r9, #16                    ; r9 = *(Y+1) - 16

   mla r10, r8, r9, r6                ; r10 is the Red part of the RGB triplet
   mla r7, r8, r9, r5                 ; r7 is the Blue part of the RGB triplet
   mla r2, r8, r9, r4                 ; r2 is the Green part of the RGB triplet

   ldrb r2, [r11, r2, asr #(16+2)]    ; r2 = G composant
   add r12, r12, r2, lsl #(5+16)      ; r12 = RGB .G.
   ldrb r7, [r0, r7, asr #(16+3)]     ; r7 = B composant
   add r12, r12, r7, lsl #(0+16)      ; r12 = RGB .GB
   ldrb r10, [r0, r10, asr #(16+3)]   ; r10 = R composant
   add r12, r12, r10, lsl #(11+16)    ; r12 = RGB RGB

   ;---- do store ----
   ldr r3, [sp, #4]                   ; r3 = RGB
   add r3, r3, #4                     ; r3 = RGB++ (ie next double-pixel)
   str r3, [sp, #4]                   ; store the RGB pointer
   str r12, [r3]                      ; store the rgb pixel at *RGB

   ;---- next line ----
   ldr r1, [sp, #32]                  ; r1 = Ynext
   ldrb r9, [r1]                      ; r9 = *Ynext
   sub r9, r9, #16                    ; r9 = *Ynext - 16

   mla r2, r8, r9, r4                 ; r2 is the Green part of the RGB triplet
   mla r7, r8, r9, r5                 ; r7 is the Blue part of the RGB triplet
   mla r10, r8, r9, r6                ; r10 is the Red part of the RGB triplet

   ldrb r12, [r0, r7, asr #(16+3)]    ; r12 = ..B ...
   ldrb r10, [r0, r10, asr #(16+3)]   ; r10 = B composant
   add r12, r12, r10, lsl #11         ; r12 = R.B ...
   ldrb r2, [r11, r2, asr #(16+2)]    ; r2 = G composant
   add r12, r12, r2, lsl #5           ; r12 = RGB ...

   ;---- next pixel
   ldrb r9, [r1, #1]                  ; r9 = *(Ynext+1)
   sub r9, r9, #16                    ; r9 = *(Ynext+1) - 16

   add r1, r1, #2                     ; r1 = Ynext + 2
   str r1, [sp, #32]                  ; store the increased Ynext pointer

   mla r7, r8, r9, r5                 ; r7 is the Blue part of the RGB triplet
   mla r10, r8, r9, r6                ; r10 is the Red part of the RGB triplet
   mla r2, r8, r9, r4                 ; r2 is the Green part of the RGB triplet

   ldrb r7, [r0, r7, asr #(16+3)]     ; r7 = B composant
   add r12, r12, r7, lsl #(16+0)      ; r12 = RGB ..B
   ldrb r10, [r0, r10, asr #(16+3)]   ; r10 = R composant
   add r12, r12, r10, lsl #(16+11)    ; r12 = RGB R.B
   ldrb r2, [r11, r2, asr #(16+2)]    ; r2 = G composant
   add r12, r12, r2, lsl #(16+5)      ; r12 = RGB RGB

   ;---- do store
   ldr r3, [sp, #0]                   ; r3 = RGBnext pointer
   add r3, r3, #4                     ; r3 = next pixel on the RGBnext line
   str r12, [r3, #-4]                 ; store the next pixel
   str r3, [sp, #0]                   ; store the increased 'next line' pixel pointer

   ;-- complete, do loop --
   subs lr, lr, #2                    ; decrement the line counter
   bne yuv_loop                       ; and restart if not at the end of the line

   ldr r0, [sp, #8]                   ; r0 = saved width
   ldr r1, [sp, #0]                   ; r1 = RGBnext pointer
   mov lr, r0                         ; lr = saved width (to restart the line counter)
   str r1, [sp, #4]                   ; current RGBnext pointer is next iteration RGB pointer
   add r1, r1, r0, lsl #1             ; r1 = update RGBnext to next line
   str r1, [sp, #0]                   ; store updated RGBnext pointer

   ldr r3, [sp, #16]                  ; r3 = (2 * stride_Y) - width
   ldr r4, [sp, #28]                  ; r4 = Y ptr
   ldr r5, [sp, #32]                  ; r5 = Ynext ptr
   add r4, r4, r3                     ; r4 = Y ptr for the next two lines
   add r5, r5, r3                     ; r5 = Ynext ptr for the next two lines
   str r4, [sp, #28]                  ; store updated Y pointer
   str r5, [sp, #32]                  ; store update Ynext pointer

   ldr r1, [sp, #20]                  ; r1 = stride_U - (width / 2)
   ldr r2, [sp, #36]                  ; r2 = U ptr

   ldr r6, [sp, #12]                  ; get height counter
 
   add r2, r2, r1                     ; update U ptr
   str r2, [sp, #36]                  ; store updated U ptr (and update 'V' at the same time :-) )

   subs r6, r6, #2
   str r6, [sp, #12]
   bne yuv_loop
 
   ; Exit cleanly :-)
   add sp, sp, #(11*4)             ; remove all custom things from stack
   ldmia   sp!, { r4 - r12, pc }   ; restore callee saved regs and return


; Constants and clipping tables used by the YUV -> RGB565 conversion code.
; The code addresses these labels pc-relatively (add rX, pc, #(label-.-8)),
; so their order and sizes must not change.
;
; const_storage holds five 32-bit coefficients. The code shifts the products
; right by 16 (plus 3 or 2 more bits), so they act as 16.16 fixed-point values:
;   crv   = 0x00019895  (about  1.596)
;   cbu   = 0x00020469  (about  2.017)
;   -cgu  = 0xffff9bb5  (about -0.392)
;   -cgv  = 0xffff2fe1  (about -0.813)
;   multy = 0x00012A15  (about  1.164)
const_storage
   ; In order : crv, cbu, - cgu, - cgv, multy
   DCD 0x00019895, 0x00020469, 0xffff9bb5, 0xffff2fe1, 0x00012A15
; 48 zero bytes placed directly before rb_clip. The code indexes rb_clip with
; a signed value (ldrb rX, [r0, rY, asr #(16+3)]), so negative indices down to
; -48 land here and read 0 (clamp to minimum).
rb_clip_dummy
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
; 5-bit clamp table for the R and B fields: indices 0..31 map to themselves,
; indices 32..79 saturate to 0x1f.
rb_clip
       DCB 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       DCB 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
       DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
       DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
       DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
; 96 zero bytes placed directly before g_clip: negative indices down to -96
; (ldrb rX, [r11, rY, asr #(16+2)]) read 0.
g_clip_dummy
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
; 6-bit clamp table for the G field: indices 0..63 map to themselves,
; indices 64..159 saturate to 0x3f.
g_clip
       DCB 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       DCB 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
       DCB 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f
       DCB 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
END

2.16.2009

YUV to RGB in ARMv4 assembly

以前 ipaq 3630 familiar 的bbplayer 有一段code作 yuv to rgb..
(http://www.koders.com/noncode/fid9B79A2EAD6C3F6EE8454AB93E5D9F77A1C509D19.aspx?s=mp3)
用 assembly 寫的，因為看到版權宣告是 free 的，所以把全部內容都貼出來：


/*
Copyright (c) 2001 Lionel Ulmer ([email protected] / [email protected])

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

/* WARNING : this function only works when stride_U == stride_V (I use some hacks to
        not have to do too many computations at line's end)...

  C-like prototype :
   void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results, unsigned char *_py,unsigned char *_pu,unsigned char *_pv);

*/

#ifdef __arm__

   .text
 .align
 
   .global convert_yuv420_rgb565
@ convert_yuv420_rgb565: convert a planar YUV 4:2:0 image to RGB565.
@
@ Each pass of yuv_loop handles a 2x2 block of pixels: two Y samples from the
@ current row, two from the row below, and one shared U and V sample. It
@ writes one 32-bit word (two 16-bit pixels) to each of two output rows.
@
@ Inputs (registers on entry):
@   r0 = picture: 32-bit words are read at offsets 0, 4, 12, 16 and 20; the
@        original comments label them dest width, dest height, source width,
@        source height and RGB stride (in bytes).
@   r1 = results: output RGB565 buffer
@   r2 = _py: Y plane
@   r3 = _pu: U plane
@
@ NOTE(review): the fifth prototype argument (_pv) is never read. V is
@ addressed as U + (sourcewidth * sourceheight) / 4, so the V plane must
@ start exactly that many bytes after the U plane.
@ NOTE(review): the Y rows are advanced by sourcewidth and the U pointer is
@ never adjusted at the end of a row, so the code assumes a Y stride equal to
@ sourcewidth and a chroma stride equal to sourcewidth / 2.
@ NOTE(review): both loop counters are decremented by 2 and tested with bne,
@ so sourcewidth and sourceheight must be even and non-zero for the loops to
@ stop where intended.
@ NOTE(review): pixels are stored with word stores (str), which presumably
@ requires 4-byte aligned output rows - confirm against callers.
convert_yuv420_rgb565:

   stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
   ldr r7,[r0,#0]                  @ r7 = dest width
   ldr r9,[r0,#4]                  @ r9 = dest height
   ldr r10,[r0, #12]               @ r10 = sourcewidth
   ldr r5,  [r0, #16]              @ r5 = sourceheight
   mul r4,r5,r10                   @ r4 = sourceheight * sourcewidth
   mov r4,r4,lsr#2                 @ r4 = (width * height) / 4, used as the V - U offset
   ldr r6,  [r0, #20]              @ r6 = rgbstrid (bytes per output row)
   ldr r8,[r0, #12]                @ r8 = sourcewidth (read again)
   add r8, r8, r2                  @ r8 = Ynext = _py + sourcewidth (row below)
   add r0,r1,r6                    @ r0 = RGB + one output row
   stmdb   sp!, { r0-r10 }
   @ Stack description :
   @ (sp+ 0) RGB + one line  r0
   @ (sp+ 4) RGB             r1
   @ (sp+ 8) _py             r2
   @ (sp+12) _pu             r3
   @ (sp+16) _pv - _pu       r4
   @ (sp+20) sourceheight    r5
   @ (sp+24) rgbstrid        r6
   @ (sp+28) destwidth       r7
   @ (sp+32) Ynext           r8
   @ (sp+36) destheight      r9
   @ (sp+40) sourcewidth     r10              
   @ destwidth (sp+28) and destheight (sp+36) are saved but never read back.

   mov lr,r10                         @ Initialize the width counter
   @ In ARM state pc reads as the current instruction address + 8, hence
   @ the (label-.-8) form used to build pc-relative table addresses.
   add r0, pc, #(const_storage-.-8)   @ r0 = base pointer to the constants array
   ldr r8, [r0, #(4*4)]               @ r8 = multy (stays in r8 for the whole function)
   @ Register roles inside the loop:
   @   r4 / r5 / r6 = chroma terms nonyc_g / nonyc_b / nonyc_r, shared by the
   @                  four pixels of the 2x2 block
   @   r8           = multy
   @   r0           = constants pointer, later reused as the rb_clip pointer
   @   r11          = *U - 128, later reused as the g_clip pointer
   @   r12          = *V - 128, later the 32-bit output word being assembled
   @   lr           = pixels left in the current row
   @ Loads and multiplies of neighbouring pixels are interleaved, so the
   @ instruction order does not follow pixel order.
yuv_loop:
   add r0, pc, #(const_storage-.-8)   @ r0 = base pointer to the constants array (r0 was reused)
   ldr r10, [sp, #8]                 @ r10 = Y    ...
   ldr r1, [sp, #12]                  @ r1 = U    ...
   ldrb r9, [r10, #0]                 @ r9 = *Y    ...
   ldrb r11, [r1]                     @ r11 = *U
   add r1, r1, #1                     @ r1 = U++
   ldr r2, [sp, #16]                  @ r2 = V - U ...
   str r1, [sp, #12]                  @ store U++
   add r2, r2, r1                     @ r2 = V+1
   ldrb r12, [r2, #-1]                @ r12 = *V
   sub r11, r11, #128                 @ r11 = *U - 128
   sub r12, r12, #128                 @ r12 = *V - 128
   ldr r1, [r0, #(4*0)]               @ r1 = crv
   mov r7, #32768                     @ r7 = 32768 (0.5 in 16.16 fixed point, added through MLA)
   ldr r2, [r0, #(4*3)]               @ r2 = -cgv
   mla r6, r1, r12, r7                @ r6 = nonyc_r = crv * (*V - 128) + 32768
   ldr r3, [r0, #(4*1)]               @ r3 = cbu
   mla r4, r2, r12, r7                @ r4 = - cgv * (*V - 128) + 32768
   sub r9, r9, #16                    @ r9 = *Y - 16
   mla r5, r3, r11, r7                @ r5 = nonyc_b = cbu * (*U - 128) + 32768
   ldr r0, [r0, #(4*2)]               @ r0 = -cgu
   mla r7, r8, r9, r6                 @ r7 = (*Y - 16) * multy + nonyc_r
   add r10, r10, #2                   @ r10 = Y + 2
   mla r4, r0, r11, r4                @ r4 = nonyc_g = - cgu * (*U - 128) + r4 = - cgu * (*U - 128) - cgv * (*V - 128) + 32768
   add r0, pc, #(rb_clip-.-8)         @ r0 contains the pointer to the R and B clipping array
   mla r12, r8, r9, r5                @ r12 = (*Y - 16) * multy + nonyc_b
   ldrb r7, [r0, r7, asr #(16+3)]     @ r7 = R component (5 bits, clamped by the table)
   mla r1, r8, r9, r4                 @ r1 = (*Y - 16) * multy + nonyc_g
   ldrb r9, [r10, #-1]                @ r9 = *(Y+1)
   str r10, [sp, #8]                 @ save Y + 2
   ldrb r12, [r0, r12, asr #(16+3)]   @ r12 = B component (and the start of the RGB word)
   add r11, pc, #(g_clip-.-8)         @ r11 now contains the pointer to the G clipping array
   ldrb r1, [r11, r1, asr #(16+2)]    @ r1 contains the G part of the RGB triplet (6 bits)
   sub r9, r9, #16                    @ r9 = *(Y+1) - 16
   mla r10, r8, r9, r6                @ r10 is the Red part of the RGB triplet
   add r12, r12, r7, lsl #11          @ r12 = R.B ... (R placed at bit 11)
   mla r7, r8, r9, r5                 @ r7 is the Blue part of the RGB triplet
   add r12, r12, r1, lsl #5           @ r12 = RGB ... (ie the first pixel (half-word) is done)
   mla r2, r8, r9, r4                 @ r2 is the Green part of the RGB triplet
   ldrb r10, [r0, r10, asr #(16+3)]   @ r10 = R component
   ldrb r7, [r0, r7, asr #(16+3)]     @ r7 = B component
   ldr r1, [sp, #32]                  @ r1 = Ynext
   ldrb r2, [r11, r2, asr #(16+2)]    @ r2 = G component
   ldrb r9, [r1]                      @ r9 = *Ynext
   add r12, r12, r2, lsl #(5+16)      @ r12 = RGB .G.
   sub r9, r9, #16                    @ r9 = *Ynext - 16
   mla r2, r8, r9, r4                 @ r2 is the Green part of the RGB triplet
   add r12, r12, r7, lsl #(0+16)      @ r12 = RGB .GB
   mla r7, r8, r9, r5                 @ r7 is the Blue part of the RGB triplet
   add r12, r12, r10, lsl #(11+16)    @ r12 = RGB RGB
   ldr r3, [sp, #4]                   @ r3 = RGB
   mla r10, r8, r9, r6                @ r10 is the Red part of the RGB triplet
   str r12, [r3]                      @ store the two pixels (one word) at *RGB
   add r3, r3, #4                     @ r3 = RGB++ (ie next double-pixel)
   str r3, [sp, #4]                   @ store the RGB pointer
   ldrb r9, [r1, #1]                  @ r9 = *(Ynext+1)
   add r1, r1, #2                     @ r1 = Ynext + 2
   sub r9, r9, #16                    @ r9 = *(Ynext+1) - 16
   ldrb r12, [r0, r7, asr #(16+3)]    @ r12 = ..B ...
   ldrb r10, [r0, r10, asr #(16+3)]   @ r10 = R component
   mla r7, r8, r9, r5                 @ r7 is the Blue part of the RGB triplet
   add r12, r12, r10, lsl #11         @ r12 = R.B ...
   ldrb r2, [r11, r2, asr #(16+2)]    @ r2 = G component
   mla r10, r8, r9, r6                @ r10 is the Red part of the RGB triplet
   add r12, r12, r2, lsl #5           @ r12 = RGB ...
   mla r2, r8, r9, r4                 @ r2 is the Green part of the RGB triplet
   ldrb r7, [r0, r7, asr #(16+3)]     @ r7 = B component
   str r1, [sp, #32]                  @ store the increased Ynext pointer
   add r12, r12, r7, lsl #(16+0)      @ r12 = RGB ..B
   ldrb r10, [r0, r10, asr #(16+3)]   @ r10 = R component
   ldr r3, [sp, #0]                   @ r3 = RGBnext pointer
   add r12, r12, r10, lsl #(16+11)    @ r12 = RGB R.B
   ldrb r2, [r11, r2, asr #(16+2)]    @ r2 = G component
   add r3, r3, #4                     @ r3 = next pixel on the RGBnext line
   add r12, r12, r2, lsl #(16+5)      @ r12 = RGB RGB
   str r12, [r3, #-4]                 @ store the two pixels of the row below
   str r3, [sp, #0]                   @ store the increased RGBnext pointer
   subs lr, lr, #2                    @ two pixels of the row done: decrement the width counter
   bne yuv_loop                       @ and restart if not at the end of the line


   @ ---- end of a row pair: set up pointers for the next two rows ----
   ldr r0, [sp, #40]                  @ r0 = saved sourcewidth  ....


   ldr r1, [sp, #0]                   @ r1 = RGBnext pointer
   ldr r2, [sp, #24]                  @ r2 = rgbstrid
   mov lr, r0                         @ lr = saved width (to restart the width counter)

   subs r3,r2,r0,lsl#1                @ r3 = rgbstrid - 2 * width (row padding in bytes); flags set here are not used
   add  r1,r1,r3                      @ r1 = start of the output row after RGBnext
   str r1, [sp, #4]                   @ that row becomes the RGB pointer of the next iteration
   add r1,r1,r2                       @ r1 = update RGBnext to next line
   str r1, [sp, #0]                   @ store updated RGBnext pointer

   ldr r3, [sp, #40]                  @ r3 = sourcewidth
   ldr r4, [sp, #8]                   @ r4 = Y ptr
   ldr r5, [sp, #32]                  @ r5 = Ynext ptr
   add r4, r4, r3                     @ r4 = Y ptr for the next two lines (skips the row already read as Ynext)
   add r5, r5, r3                     @ r5 = Ynext ptr for the next two lines
   str r4, [sp, #8]                  @ store updated Y pointer
   str r5, [sp, #32]                  @ store update Ynext pointer


   ldr r6, [sp, #20]                  @ get height counter (starts at sourceheight)
   subs r6, r6, #2                    @ two rows were converted
   str r6, [sp, #20]
   bne yuv_loop                       @ more rows left (lr, r8 and the stack slots are already set up)
 
   @ Exit cleanly :-)
   add sp, sp, #(11*4)             @ remove all custom things from stack
   ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return


@ Constants and clipping tables used by convert_yuv420_rgb565.
@ The function addresses these labels pc-relatively
@ (add rX, pc, #(label-.-8)), so their order and sizes must not change.
@
@ const_storage holds five 32-bit coefficients. The code shifts the products
@ right by 16 (plus 3 or 2 more bits), so they act as 16.16 fixed-point values:
@   crv   = 0x00019895  (about  1.596)
@   cbu   = 0x00020469  (about  2.017)
@   -cgu  = 0xffff9bb5  (about -0.392)
@   -cgv  = 0xffff2fe1  (about -0.813)
@   multy = 0x00012A15  (about  1.164)
const_storage:
   @ In order : crv, cbu, - cgu, - cgv, multy
   .word 0x00019895, 0x00020469, 0xffff9bb5, 0xffff2fe1, 0x00012A15
@ 48 zero bytes placed directly before rb_clip. The function indexes rb_clip
@ with a signed value (ldrb rX, [r0, rY, asr #(16+3)]), so negative indices
@ down to -48 land here and read 0 (clamp to minimum).
rb_clip_dummy:  
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
@ 5-bit clamp table for the R and B fields: indices 0..31 map to themselves,
@ indices 32..79 saturate to 0x1f.
rb_clip:
       .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       .byte 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
       .byte 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
       .byte 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
       .byte 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
@ 96 zero bytes placed directly before g_clip: negative indices down to -96
@ (ldrb rX, [r11, rY, asr #(16+2)]) read 0.
g_clip_dummy:  
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
@ 6-bit clamp table for the G field: indices 0..63 map to themselves,
@ indices 64..159 saturate to 0x3f.
g_clip:  
       .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       .byte 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
       .byte 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f
       .byte 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f
       .byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       .byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       .byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       .byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       .byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       .byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f

#endif

CeGCC - Try Inline Assembly

Reference 這一篇：ARM GCC Inline Assembly
寫一段 inline assembly code 試試..

可以reference mpegvideo_armv5te.c

訂閱：文章 (Atom)