9.14.2010
奇怪mcu的I2C奇怪波形
8.10.2010
Build Assembly code in VS2005
大概也跟 eVC 一樣,就是為 assembly.asm 加上 custom build command.
- 先把 assembly code 的 source file 寫好
- 開啟 VS2005,把 assembly file 加到 source folder 中
- 在 assembly file 按右鍵,選 properties
- Custom Build Step -- General --Command Line,, 寫
armasm.exe -cpu xscale "$(InputPath)" "$(IntDir)/$(InputName).obj"
- Output 寫
$(IntDir)/$(InputName).obj
8.09.2010
CpuId - 紀錄一下
這就是 copy 自 tcpmp getcpuid 的 code 後,run 起來:
- 我們的: 4117B362,1D152152,0,0
- 測試的: 410FB764,1D992992,0,0
4117B362:ref(http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0360f/CACEDHJG.html)
所以是 ARM1136, rev2
410FB764: ref (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0360f/CACEDHJG.html)
所以是 ARM1176 , rev 4
第二個,cache type register
ref : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0211k/I67616.html
- [31:29] : always 0
- [28:25] : ctype - cache type b1110 means "write back","register 7 operation","format C"
- [24] : separate D and I cache
- [23:12] : D cache size
- [11:0] : I cache size
- [11] : P
- [10:9] : always 0
- [8:6] : size
- [5:3] : association (way)
- [2] : M
- [1:0] Length
1D152152 : 16K, 4 way, len=32 bytes.
1D992992: 32K, 4 way, len=32 bytes.
driver code
GETID.C:
#include <windows.h>
#include <drvlib.h>
//----- Code Control -----
// NOTE(review): GID_DEBUG is defined here, but the block below is guarded by
// DEBUG and no use of GID_DEBUG is visible in this listing -- confirm which
// symbol was intended.
#define GID_DEBUG
#ifdef DEBUG
// Debug-zone settings, compiled only when DEBUG is defined: module name
// "Getid", sixteen zone names (only the first, "Verbose", is named) and an
// initial zone mask of 0 (no zone enabled).
DBGPARAM dpCurSettings = { TEXT("Getid"), {
TEXT("Verbose"), TEXT(""), TEXT(""), TEXT(""),
TEXT(""), TEXT(""), TEXT(""), TEXT(""),
TEXT(""), TEXT(""), TEXT(""), TEXT(""),
TEXT(""), TEXT(""), TEXT(""), TEXT("")},
0x00000000
};
#endif
extern void GetCpuId(int,DWORD*);
/*
 * Clears the caller's four-DWORD result buffer and then hands it to the
 * assembly routine GetCpuId().  Pre-clearing means any slot GetCpuId() does
 * not write is reported as 0 rather than stack garbage.
 *
 *   Id - passed through unchanged to GetCpuId()
 *   p  - buffer with room for at least four DWORDs
 */
void SafeGetCpuId(int Id, DWORD* p)
{
    int slot;

    for (slot = 0; slot < 4; slot++)
    {
        p[slot] = 0;
    }

    GetCpuId(Id, p);
}
//--- GID_Init
/*
 * Driver init entry point.  Reads the CPU identification words through
 * SafeGetCpuId(), prints them on the debug output and also shows them in a
 * message box, then reports success.
 *
 *   dwContext - not used by this driver
 *
 * Returns TRUE (non-zero) unconditionally.
 */
DWORD GID_Init(DWORD dwContext)
{
    DWORD CpuId[4];
    /* Four 8-digit hex words, three commas and the terminator need at most
       36 characters, so 60 is sufficient for the format used below. */
    TCHAR outmsg[60];

    RETAILMSG(1, (TEXT("[GID] GID_Init+\r\n")));

    SafeGetCpuId(0,CpuId);
    RETAILMSG(1,(TEXT("%X,%X,%X,%X\r\n"),CpuId[0],CpuId[1],CpuId[2],CpuId[3]));

    swprintf(outmsg,TEXT("%X,%X,%X,%X"),CpuId[0],CpuId[1],CpuId[2],CpuId[3]);
    MessageBox(NULL,outmsg,TEXT("GID"),MB_OK);

    RETAILMSG(1, (TEXT("[GID] GID_Init OK \r\n")));
    return TRUE;
}
//--- GID_Deinit
/* Deinit entry point: logs the call and always reports success. */
BOOL GID_Deinit(DWORD dwContext)
{
    (void)dwContext;    /* nothing to release */

    RETAILMSG(1, (TEXT("[GID] GID_Deinit+\r\n")));

    return TRUE;
}
//--- GID_Open
/*
 * Open entry point: logs the call and hands back dwData unchanged as the
 * open handle.  dwAccess and dwShareMode are ignored.
 */
DWORD GID_Open(DWORD dwData, DWORD dwAccess, DWORD dwShareMode)
{
    DWORD hOpen = dwData;

    (void)dwAccess;
    (void)dwShareMode;

    RETAILMSG(1, (TEXT("[GID] GID_Open+\r\n")));

    return hOpen;
}
//--- GID_Close
/* Close entry point: logs the call and always reports success. */
BOOL GID_Close(DWORD Handle)
{
    (void)Handle;       /* no per-open state to free */

    RETAILMSG(1, (TEXT("[GID] GID_Close+\r\n")));

    return TRUE;
}
//--- GID_Read
/* Read entry point: logs the call; no data is produced, so 0 is returned. */
DWORD GID_Read(DWORD Handle, LPVOID pBuffer, DWORD dwNumBytes)
{
    const DWORD bytesRead = 0;

    (void)Handle;
    (void)pBuffer;
    (void)dwNumBytes;

    RETAILMSG(1, (TEXT("[GID] GID_Read+\r\n")));

    return bytesRead;
}
//--- GID_Write
/* Write entry point: logs the call; nothing is consumed, so 0 is returned. */
DWORD GID_Write(DWORD Handle, LPCVOID pBuffer, DWORD dwNumBytes)
{
    const DWORD bytesWritten = 0;

    (void)Handle;
    (void)pBuffer;
    (void)dwNumBytes;

    RETAILMSG(1, (TEXT("[GID] GID_Write+\r\n")));

    return bytesWritten;
}
//--- GID_Seek
/* Seek entry point: logs the call and always returns (DWORD)-1. */
DWORD GID_Seek(DWORD Handle, long lDistance, DWORD dwMoveMethod)
{
    const DWORD seekResult = (DWORD) -1;

    (void)Handle;
    (void)lDistance;
    (void)dwMoveMethod;

    RETAILMSG(1, (TEXT("[GID] GID_Seek+\r\n")));

    return seekResult;
}
//--- GID_IOControl
/*
 * IOControl entry point: logs the call and returns FALSE for every control
 * code.  None of the buffers or lengths are touched.
 */
BOOL GID_IOControl(
    DWORD Handle,
    DWORD dwCode,
    PBYTE pBufIn,
    DWORD dwLenIn,
    PBYTE pBufOut,
    DWORD dwLenOut,
    PDWORD pdwActualOut
    )
{
    (void)Handle;
    (void)dwCode;
    (void)pBufIn;
    (void)dwLenIn;
    (void)pBufOut;
    (void)dwLenOut;
    (void)pdwActualOut;

    RETAILMSG(1, (TEXT("[GID] GID_IOControl+\r\n")));

    return FALSE;
}
/*
 * DLL entry point (named by DLLENTRY in the sources file).
 *
 *   hinstDll   - instance pointer
 *   dwReason   - reason the routine is called
 *   lpReserved - system parameter (unused)
 *
 * On process attach, thread library calls are disabled for this module.
 * Always returns TRUE.
 */
BOOL
GID_DllEntry(
    HINSTANCE hinstDll,
    DWORD dwReason,
    LPVOID lpReserved
    )
{
    (void)lpReserved;

    switch (dwReason)
    {
    case DLL_PROCESS_ATTACH:
        DisableThreadLibraryCalls((HMODULE) hinstDll);
        break;

    default:
        break;
    }

    return TRUE;
}
arm.s
; GetCpuId -- declared on the C side as: extern void GetCpuId(int, DWORD*)
; Stores two CP15 c0 words through the pointer in r1: offset 0 gets the
; opcode2=0 read (CPU ID) and offset 4 gets the opcode2=1 read (cache type).
; If the low four CPSR bits are not all set, nothing is stored at all.
; NOTE(review): r1 is presumably the DWORD* argument (second argument under
; the ARM calling convention) -- confirm.
INCLUDE kxarm.h
; NOTE(review): GetCpuId is exported here and again after LEAF_ENTRY below;
; one of the two is presumably redundant -- confirm.
EXPORT GetCpuId
TEXTAREA
LEAF_ENTRY GetCpuId
export GetCpuId
mrs r0,cpsr ; r0 (the incoming first argument) is overwritten with CPSR
and r0,r0,#15 ; keep only the low four bits
cmp r0,#15
bne UserMode ; not all four bits set: skip the coprocessor reads
mrc p15,0,r0,c0,c0,0 ; CP15 c0, opcode2 = 0: CPU ID
nop
nop
mrc p15,0,r2,c0,c0,1 ; CP15 c0, opcode2 = 1: cache type
nop
nop
str r0,[r1,#0] ; first result word
str r2,[r1,#4] ; second result word
UserMode
mov pc,lr ; return to caller
ref (http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0360f/CACEDHJG.html) CpuID 取的是 opcode2 = 0 和 1 : cpu id 跟 cache type.
GETID.DEF
; Export table for getid.dll.
; GID_Seek is implemented in GETID.C alongside the other GID_* entry points,
; so it is exported here as well; without the export the function is dead code.
LIBRARY getid
EXPORTS
GID_Init
GID_Deinit
GID_Open
GID_Close
GID_Read
GID_Write
GID_Seek
GID_IOControl
sources
# Build description for getid.dll.
TARGETNAME=getid
TARGETTYPE=DYNLINK
RELEASETYPE=PLATFORM
DLLENTRY=GID_DllEntry
DEFFILE=$(_TARGETPLATROOT)\SRC\DRIVERS\getid\getid.def
PREPROCESSDEFFILE=1
SYNCHRONIZE_DRAIN=1
WINCEREL=1

# A trailing backslash continues the assignment onto the next line, so the
# last entry of each list must NOT end with one; otherwise the following
# macro assignment is swallowed into this list.
TARGETLIBS= \
    $(_COMMONSDKROOT)\LIB\$(_CPUINDPATH)\coredll.lib

SOURCELIBS=

SOURCES= \
    getid.c \
    arm.s
load driver 的程式就是以前哪個 activedriverex( ). 改一下 dll name 就可以
ref http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0211k/I67616.html
用
c0, Core feature ID registers
好像可以得到更多 cpu feature
8.05.2010
pipeline bubble
還要注意 pipeline 的特性。
pipeline 將指令分成幾個 stage:
- fetch,
- decode/register read,
- alu,
- memory write,
- register update.
ldr r0,#1
add r3,r0,r1
第二行指令在 decode/register read 時,r0 還沒update,因為上一行才執行到alu operation。所以在pipeline中的第二行指令就要停下來,等兩個clock,等上一行指令執行到register update後,才可以繼續。這樣,就白白浪費 2 個 clock 了。
為了避免這樣的情況,寫 assembly 的時候,就可以在中間插入下面要作的 assembly code,不要白白浪費這兩個clock
在 (很久)前面的文章:yuv - rgb color space convert 的 assembly code 可以看到。最後一個作者的assembly code 就有做到這個(所以整個 code 的 flow 變得不好trace)。實測的結果,也是最快的。
在 Wiki 里也有說明:
- instruction scheduling
- Instruction scheduling is an important optimization for modern pipelined processors, which avoids stalls or bubbles in the pipeline by clustering instructions with no dependencies together, while being careful to preserve the original semantics.
8.04.2010
cache
當 cpu 要讀取 memory 時, cache 先把 memory 的資料讀進 cache 中,然後再讓 cpu 讀取。
cpu 每次讀取的單位是 int (32bit),但是 cache 為求效能,會一次讀入比 int 還多的資料。
cache 每次讀入的資料大小叫 line size,每次讀入的資料放入 一個 line。
cache 就是利用這樣預先讀取的能力,讓cpu 下次需要讀取記憶體時,可以直接由 cache 拿,不需要再向 主記憶體 讀取。
由於 cache 只有小小一塊,所以必須要有一個機制知道 cpu 需要存取記憶體時,那個 address的內容是不是已經在cache 里,如果在,是在 cache 的那一條 line ...
最直覺的方法就是去找每一個 cache line 的 address,看看哪一個 line 所 cache 的記憶體位址是不是 cpu 現在要的。
這樣的 cache 叫 associative cache.
但是這樣作很耗空間(電路),因為每一個 line 都要作一個 address比較電路。
所以就有比較便宜的作法.. 把 address 分成兩部份:
tag, index
以 32 bit 的 address line 為例,可以分成:
- tag : 31~20
- index : 19~0
後來又有折衷的辦法,就是...每個 index 可以有兩個 tag欄位和lines,這樣,每20 條位址線,就可以共用兩條 cache line。 -- 這就叫 2 way cache.
=>這樣就是:拿 index 找到對應的 2 個 tag ,比較和 address bit 31-20 是不是一樣。
依此類推,可以有 4 way cache。
這一篇 (http://www.mirabilisdesign.com/Pages/Demonstrations/systemarchitecture/Cache_System/Functional_Cache_Model_Overview.html)雖然是 sim software 的說明,但是可以看到 ARM11 cache 的 implementation 圖例
cache line size 是 16 words (address 0~3), index size 是 4096 (address 4~15), tag filed 是 address 16~31,
所以 cache index size x byte size x way = cache size
7.19.2010
VFP in CE 6.0
這一篇 Google Groups 討論說
The VS2005 compiler used by PB for CE6.0 emits no VFP instructions and
unless you implement FPCRT.DLL for an ARM then all CE implementations
are integer (default MS implementation of FPCRT uses software
emulation), even if you have a hardware VFP in your core (e.g. ARM11
or Cortex-A8). To actually use the VFP you must do the following:
1. Use SYSGEN_OEM_FPCRT - catalog item "OEM Floating Point CRT (ARM
only)
2. Download the VFPv2 library support from the ARM website and
incorporate it into your build:
www.arm.com/products/os/windowsce_vfp_dl.html
Instructions are included with the download from ARM.
My tests show a 4x increase in performance on floating point
operations on a release build on an ARM1136JF-S core.
Andrew.
也就是說,CE 6 的 VFP driver(?) library 是由 ARM 提供的。
還有這一篇 MSDN 討論,說
The current version of Windows CE supports the ARMV4I architecture. Cortex A8 it's ARMv7 (ARM's naming can be even more confusing than ms one...) and currently the compiler does not use its specific features (NEON, VFP3 etc.).
The new release of Windows CE (named Windows Embedded Compact 7) will support the ARMv7 architecture.
Here you can find some information about the new features of this release:
http://www.microsoft.com/windowsembedded/en-us/products/windowsce/compact7.mspx
You can download a public beta from connect.microsoft.com.
If your project has not a very short time frame (you plan to release your device Q2 next year, for example), you may consider using this new release for your development to be able to "unleash" all the power of your CPU core.
Valter Minute
Windows Embedded MVP
http://geekswithblogs.net/WindowsEmbeddedCookbook
所以你要用 CE 6 寫 VFP 或是 NEON 的 code 是不可能的,要等 Windows Embedded Compact 7.
總合起來,可以要看一下 msdn :
Including a Replacement Floating Point C Run-Time Library in a Run-Time Image
還有這一篇的實做 copy 下來,免得不見:
Download and install the "ARM® VFPv2 Floating Point Support Library" from ARM.
If building with Platform Builder to build your OS image
simply add the FPCRT project to your solution, and set the following catalog feature
EVM_3530\Core OS\CEBASE\Applications and Services Development\C Libraries and Runtimes\OEM Floating Point CRT (ARM only)
A SYSGEN is required.
If building without Platform Builder, ensure the SYSGEN variable is set. I added the following to my "tinykernel DEBUG.bat" file.
set SYSGEN_OEM_FPCRT=1
Add entries to platform.bib
fpcrt.dll $(_FLATRELEASEDIR)\fpcrt.dll NK SH
k.fpcrt.dll $(_FLATRELEASEDIR)\fpcrt.dll NK SHMK
I copied the contents of %WINCEROOT%\OTHERS\ARM\VFPv2\src\ARMVFPv2\obj to my prebuilt directory.
Then copied the DLL project to my %WINCEROOT%\OTHERS\ARM\VFPv2\src\FPCRT to my %PLATFORMROOT%\%_TGTPLAT%\src\drivers directory.
Modified dirs, and updated sources to point to new library location.
A SYSGEN is required.
Visual Studio 2008 VFP Support
VS2008 ARM compiler supports emulated and hardware floating point. To enable hardware based Vector Floating point you need to use this option: /QRfpe-
This means not floating point emulation.
If you use
/QRfpe
it will generate software floating point, as it means floating point emulation.
Microsoft KB 947894
Note: This is not related to NEON support
有關 multimedia (video/audio/graphic) 部份,ARM 有提供 OpenMax 的實做 (實際用 ARM11, Cortex A8 feature optimize) 的 library
Khronos Standards
...但是依照慣例,ARM的網頁連結 半年後一定又會改 >_<
要註冊才能download..
喔?是以 library 的方式提供,用 realview compile (當然不是 VS...)
6.04.2010
CPU 與週邊 的速度
所以當 CPU 對 GPIO register 寫資料時,是同步在 60MHz...
; Test loop: write 1<<20 to GPIO1_PAD_EN (presumably enabling the pad for
; pin 20 -- confirm against the SoC manual), then write two values alternately
; to GPIO1_CTRL20 forever.  Per the accompanying note, this produces a square
; wave on the pin.
ldr r0, =GPIO1_PAD_EN
mov r1, #0x00100000 ; pin 20
str r1,[r0]
ldr r0, =GPIO1_CTRL20
mov r1,#0x120 ; first control value
mov r2,#0x160 ; second control value (differs from the first only in bit 6)
1 ; numeric local label: top of the endless loop
str r1,[r0]
nop
str r2,[r0]
nop
b %b1 ; branch backwards to local label 1
這一段 code,有沒有加 nop 輸出的方波都是 10MHz。
但是把 nop 改為
str r1,[r0]
str r1,[r0]
str r2,[r0]
str r2,[r0]
b %b1
這樣改完,輸出方波變成 5MHz。
也就是說..如果一直 polling 週邊的話,就等於是用 60MHz 在跑....
11.24.2009
ARM cp15 c1,c0, 0 - control register configuration
來溝通 (read/write)
mrc/mcr p15,0,,Cn,Cm,N
參考: cp15 instruction
比較常用的:Control Register Configuration:
Control Register 內容參考:c1, Control Register
MRC p15, 0,, c1, c0, 0 Read Control Register configuration data
MCR p15, 0,, c1, c0, 0 Write Control Register configuration data
-- copy 過來,免得以後改位置--
Table 3.24. Control Register bit functions
Bits | Field | Function |
---|---|---|
[31] | SBZ | This field returns a Unpredictable value when read. Should Be Zero. |
[30] | TE | Determines the state that the processor enters exceptions: 0 = Exceptions entered in ARM state 1 = Exceptions entered in Thumb state. |
[29] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[28] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[27] | NMI | Determines the state of the non-maskable bit that is set by a configuration pin FIQISNMI: 0 = The processor is backwards compatible and behaves as normal 1 = All attempts to modify the CPSR F bit can only clear it. There is no way to set it in software. The SPSRs remain freely modifiable but copying the SPSR to CPSR can only clear the F bit. FIQs continue to set the F bit automatically. NoteThe status of the FIQISNMI pin is read by Bit 27. Software cannot write to Bit 27. |
[26] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[25] | EE | Determines how the E bit in the CPSR bit is set on an exception: 0 = CPSR E bit is set to 0 on an exception 1 = CPSR E bit is set to 1 on an exception. The reset value depends on external signals, see Table 3.25. |
[24] | VE | Enables the VIC interface to determine interrupt vectors: 0 = Interrupt vectors are fixed 1 = Interrupt vectors are defined by the VIC interface. See the description of the V bit, bit 13. |
[23] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[22] | U | Enables unaligned data access operations for mixed little-endian and big-endian operation: 0 = Unaligned data access support disabled 1 = Unaligned data access support enabled. The A bit has priority over the U bit. The reset value of the U bit depends on external signals, see Table 3.25. |
[21] | FI | Configures low latency features for fast interrupts. 0 = All performance features enabled. 1 = Low interrupt latency configuration enabled. |
[20] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[19] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[18] | SBO | Should Be One. This bit reads as 1 and ignore writes. |
[17] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[16] | SBO | Should Be One. This bit reads as 1 and ignore writes. |
[15] | L4 | Determines if the T bit is set for PC load instructions: 0 = Loads to PC set the T bit. 1 = Loads to PC do not set the T bit, ARMv4 behavior. For more details, see the ARM Architecture Reference Manual. |
[14] | RR | Determines the replacement strategy for the cache: 0 = Normal replacement strategy by random replacement 1 = Predictable replacement strategy by round-robin replacement. |
[13] | V | Determines the location of exception vectors: 0 = Normal exception vectors selected, address range = 1 = High exception vectors selected, address range = |
[12] | I | Enable or disable level one instruction cache: 0 = disabled 1 = enabled. |
[11] | Z | Enables programme flow prediction: 0 = Program flow prediction disabled 1 = Program flow prediction enabled. |
[10:8] | SBZ | Should Be Zero. This bit reads as 0 and ignores writes. |
[7] | B | Determines operation as little-endian or big-endian memory system and the names of the low four-byte addresses within a 32-bit word: 0 = Little-endian memory system 1 = Big-endian word-invariant memory system. The reset value of the B bit depends on external signals, see Table 3.25. |
[6:3] | SBO | Should Be One. This field read as 1 and ignore writes. |
[2] | C | Enables or disables level one data cache: 0 = Data cache disabled 1 = Data cache enabled. |
[1] | A | Enables strict alignment of data to detect alignment faults in data accesses: 0 = Strict alignment fault checking disabled. 1 = Strict alignment fault checking enabled. The A bit setting takes priority over the U bit. |
[0] | M | Enables or disables the MPU: 0 = MPU disabled 1 = MPU enabled. |
6.10.2009
iMX31 PDK build Image and upload
iMX31 3 stack board 又叫做 PDK (platform Developement Kit)。
search IMX31
- IMX31_SDK_14 : IMX31_SDK_WINCE5_BSP.msi
安裝一個platform : C:\WINCE500\PLATFORM\3ds
Platform 是 C:\WINCE500\PBWorkspaces\3dsmobility
依照 IMX31_PDK14_UG.pdf 的說明,install, build image:
- 因為 有上 CE QFE,所以要手動 build ms sd driver
- 手動build csp
- sysgen
- 手動 build. 這兩個 folder 是不會自動 build 的。
燒錄 XLDR.NB0
- 調整 PDK debug board 的 SW (紅色),由左到右,依次是:0.0.0.0.0.0 (download mode)
- 接 RS232 到 debug board,開機
- 啟動 ATK - 選 iMX31 TOC02, DDR, Go
- 第二頁選 Flash Tool,Next
- NAND - K9K2G08U0A,
- Erase
- Program - XLDR.NB0 (start address : 0x0000)
- Program - EBOOT.NB0 (start address : 0x20000)
6.01.2009
RealView ICE and gdb
手動安裝 RealView ICE for Linux (其實是 redhat)。
裝完後,會在 /opt/ARM/RVI
執行 /opt/ARM/RVI/RVI/.../linux-pentium/rvconfig 會出現跟 windows 一樣的GUI。
follow windows 的作法,identify 出 JTAG scanchain 上的 cpu.
save : rvi.rvc - copy 到 home 比較方便
執行 /opt/ARM/RVI/RVD/Tools/..../rvigdbconfig -f rvi.rvc
reading file rvi.rvc192.168.144.185 就是 RealView ICE 的 ip
Done
connecting to 192.168.144.185
found 192.168.144.185
connected
finished
這樣就可以啟動 arm-gdb 了 (用 CodeSourcery) : arm-none-linux-gnueabi-gdb
(gdb) target remote 192.168.144.185:4000
Remote debugging using 192.168.144.185:4000
0x00000000 in ??()
(gdb)
5.26.2009
Worklog : iMX31 Demoboard, build WINCE5.0
- Nand : K9F2G08R0A - 256MB
- DDR 128MB
- SW5 : 0
- SW6: 1
- SW7: 0
- SW8: 0
- SW9: 0
- SW10: 0
Demoboard 使用 iMX31 和 MC13783 (PMIC with Audio AMP)。
MC13783 的 size 幾乎和 iMX31 一樣大
CSP 放在 C:\WINCE500\PUBLIC\COMMON\OAK\CSP\FREESCALE
- MX31 : MX31 相關
- MXARM11 : 所有MX 系列中 arm11 相關
依照順序來..
Build CSP:
因為 CSP 不會 auto build (是因為在 public 的關係 ?),所以要手動 build。
到 file,favorite,CSP 按右鍵,check "clean before build" then do "build current project"
大概就是到 public\common\oak\csp\freescale 下面作 build -c。build 好的 lib 會在 C:\WINCE500\PUBLIC\COMMON\OAK\LIB\ARMV4I\DEBUG
接下來 build BSP (for the 1st time)
build OS menu :
- clean before building : X
- Copy Files to Release Directory After build : V
- Make Run-Time Image After Build : V
要 Clean build 整個 BSP (CSP, PQOAL, BSP) 的話:
- build CSP - follow 前面的步驟,按右鍵...
- build PQOAL
- Clean Before Building : V
- Copy Files to Release Directory : V
- Make Image After Build : X
- 執行 Build OS - sysgen
- build BSP
- Clean Before Building : V
- Copy Files to Release Directory : V
- Make Image After Build : V
- 執行 Build OS - Build and sysgen current BSP
如果只有某部份的 code 修改,可以只build 那部份的 code,然後 執行 Build and sysgen current BSP
到 Files :
- CSP 有修改 : 到修改的 folder 按右鍵- check "Clean before Build" 然後 "Build and sysgen current BSP"
- PQOAL : 一樣,到 PQOAL folder 按右鍵
- Clean before Build : X
- Copy Files to Release Directory After Build : V
- Make Run time image after Build : V
- 執行 : Build and Sysgen Current BSP
5.13.2009
openmoko - flash kernel and rootfs
Debian 直接用 apt 安裝就可以。
通常需要
- kernel - uImage
- rootfs
rootfs 通常是有 jffs2 在檔名中,kernel 通常就有 uImage 在檔名中。
開機進入 nor boot (nand boot) ? 就可以接上 usb ,和 dfu-util 溝通。
按著hold key 開機,進入nor boot 畫面,插上 usb。dmesg..
所以應該是有偵測到...
[786627.436013] usb 2-3: new full speed USB device using ohci_hcd and address 2
[786627.654996] usb 2-3: configuration #1 chosen from 2 choices
[786627.677190] usb 2-3: New USB device found, idVendor=1d50, idProduct=5119
[786627.677198] usb 2-3: New USB device strings: Mfr=1, Product=2, SerialNumber=3
[786627.677202] usb 2-3: Product: Neo1973 Bootloader U-Boot 1.3.2-moko12
[786627.677205] usb 2-3: Manufacturer: OpenMoko, Inc
[786627.677207] usb 2-3: SerialNumber: 0000000
[786628.281641] cdc_acm: This device cannot do calls on its own. It is no modem.
[786628.281641] cdc_acm 2-3:1.0: ttyACM0: USB ACM device
[786628.288104] usbcore: registered new interface driver cdc_acm
[786628.288112] cdc_acm: v0.26:USB Abstract Control Model driver for USB modems and ISDN adapters
燒錄過程要注意的是...沒有操作30 sec 後,機器會自動 power off。
所以看到螢幕黑掉,就要重新 power on.
rootfs download 比較久。
重開機很花時間,約需要 2 min,約 1min 後才會出現splash screen.然後滅掉..
openmoko new freerunner - boot into nand and nor
所以可以讓我玩一下..
先follow instruction,檢查一下nor boot 可不可以boot:
- .按左上角hold button不放
- 按右下角power button開機button不放,, 開機
- 放掉左上角hold button
- 放掉右下角power button
接著檢查 nand boot 可不可以 boot:
- 按著右下角 power button 不放
- 馬上按左上角 hold button,維持 5~8 sec,,開機
- 都可以放開了
4.22.2009
build new rev openocd in msys
follow 這一篇..安裝 msys, minGW 環境。
msys - 先安裝 1.0.10
msys-DTK : 是build 需要的一些 tool,像 perl, autoconf, automake... etc
然後 download msys-core-1.0.11,解開後,覆蓋 msys的安裝目錄。
msys 的 update 就是 untar , overwrite ?之後安裝新版 autoconf, automake, libtool。
安裝方式都是 download
http://ftp.gnu.org/gnu/autoconf/autoconf-2.61.tar.bz2
http://ftp.gnu.org/gnu/automake/automake-1.10.tar.bz2
http://ftp.gnu.org/gnu/libtool/libtool-1.5.24.tar.gz
這些tool 會install 在 /usr/local/bin,和原先的 tool 位置 (/bin) 不同。OK !
可以到 openocd 下作 bootstrap,configure and make.
因為已經是在 mingw 下作,所以 configure 時不用加 CC="gcc -mno-cygwin"
只需要加 -enable-ft2232_libftdi
- configure 發生exception ,說找不到 libusb0.dll ,是因為沒有install libusb-win32 driver 的關係,download 那個有 "filter" 字樣的 libusb-win32-exe 下來 安裝就可以。
- bootstrap 發生 Can't locate object method "path" via package "Autom4te::Request" at /usr/bin/autom4te line 81.
說明是說把 autom4te.cache 刪掉就可以。
的確是這樣,刪掉之後,這個 error 就沒了,但是出現其他的 error。
其實這是 autom4te 版本不一致的關係,安裝過新版的 automake 後就 OK 了 (就不會出現這個 error).
build openocd under mingw - with ftd2232 chip interface
openocd 在 mingw 下 build..
先要 install mingw(gcc for win) 和 msys(unix shell for win)
因為安裝 msys 會尋問 mingw在哪,所以要先install mingw:
mingw 現在已經有像 cygwin那樣的網路安裝了,download MinGW-5.1.4.exe
http://sourceforge.net/project/showfiles.php?group_id=2435&package_id=240780
執行後就會出現問題,選安裝,然後額外的package 選 g++ 和 make..
之後就會 自己 download 和安裝完畢
我裝在 C:\MinGW。裝完後要手動把 c:\mingw\bin 加到 cmd 的環境變數。
這樣裝完後,就可以開啟cmd.exe,run gcc 了
但是要run make 和 bash file需要 msys。 download msys-1.0.10.exe
http://downloads.sourceforge.net/mingw/MSYS-1.0.10.exe
直接執行就可以,中途會問你postinstall,回 y。
然後問你有沒有 MinGW,回答有,並且把剛剛install MinGW的 path 寫出來
!!但是因為在 unix 中,目錄的左右斜線是相反的,要注意。
裝完後就可以在 programfile - MinGW- msys 啟動 msys 環境。
這個openocd 用 ftd232 (usb-232).
所以要 libftdi.a
libftdi.a 會用到 libusb.a
因為是 for windows 版,所以要dowload windows 版 libusb - libusb-win32-device-bin-0.1.12.1.tar.gz
http://libusb-win32.sourceforge.net/
下載的是bin 檔 (prebuild),所以把 usb.h copy 到 /mingw/include 把 libusb.a copy 到 /mingw/lib
還要安裝 libusb-win32 的 dll : libusb-win32-filter-bin-0.1.12.1.exe
http://sourceforge.net/project/downloading.php?group_id=78138&filename=libusb-win32-filter-bin-0.1.12.1.exe&a=96392657
這樣就可以 開始 build libftdi.a - download libftdi-0.15.tar.gz
http://www.intra2net.com/en/developer/libftdi/download.php
解開,到libftdi-0.15 下, ./configure 然後 make
會出現 error,但是 check 一下 error message,會發現 libftdi.a build 完,是 build example 時有 error,所以不管。
把 src/ftdi.h copy 到 /mingw/include
把 src/.libs/libftdi.a copy 到 /mingw/lib
最後就要 build openocd 了..
download trunk 的版本 (release 版支援的 cpu 比較少..)
解開, run ./bootstrap
./configure -enable-ft2232_libftdi CC="gcc -mno-cygwin"
make
2.18.2009
AVPicture structure 的內容 - 還是 convert_yuv420_rgb565.S
/*
 * Describes the contents of one frame: a data pointer and a line size for
 * each of up to four channels (planes).
 */
typedef struct AVPicture {
    uint8_t *data[4];   /* per-channel data pointers */
    int linesize[4];    /* per-channel line size */
} AVPicture;
是給說明一個 frame 的資料內容。
data 是 channel data.
linesize 是channel 的 linesize (寬度).
所以最多有 4 個 channel 資料 (? 哪一種 format 會有四個 channel ?).
以 YUV420 為例,320 x 240 的圖檔,有 Y.U.V 三個 channel,Y 的 linesize 是 320。U, V 的 linesize 都是 160.
所以用到 data[0]~data[2],linesize[0]~linesize[2]。
以 bgr565為例,一個 pixel 一個 word:bgr565,所以沒有分開 channel。所以只有用到 data[0] 和 linesize[0]。
所以 convert _yuv420_rgb565.S 的一開始的 argument 處裡:
就是分別取出
ldr r7, [r0, #0] ; Y ptr
ldr r9, [r0, #4] ; U ptr
ldr r10, [r0, #8] ; V ptr
subs r10, r10, r9 ; V ptr - U ptr
ldr r8, [r0, #12]
add r8, r8, r7 ; Y + stride_Y
ldr r4, [r0, #12] ; Stride_Y
mov r4, r4, lsl #1
sub r4, r4, r2 ; (2 * Stride_Y) - width
ldr r5, [r0, #16] ; Stride_U
sub r5, r5, r2, lsr #1 ; Stride_U - (width / 2)
ldr r6, [r0, #20] ; Stride_V
sub r6, r6, r2, lsr #1 ; Stride_V - (width / 2)
add r0, r1, r2, lsl #1 ; RGB + 1
stmdb sp!, { r0-r10 }
; Stack description :
; (sp+ 0) RGB + one line
; (sp+ 4) RGB
; (sp+ 8) width (save)
; (sp+12) height
; (sp+16) (2 * stride_Y) - width
; (sp+20) stride_U - (width / 2)
; (sp+24) stride_V - (width / 2) !!! UNUSED !!!
; (sp+28) Y ptr
; (sp+32) Y ptr + one line
; (sp+36) U ptr
; (sp+40) V - U
data[0] : Y, data[1] : U, data[2] : V
和
linesize[0] : stride_Y, linesize[1] : stride_U, linesize[2] : stride_V.
那
convert_yuv420_rgb565(AVPicture *pic,unsigned char *out,int width,int height)的參數 width, 和 Stride_Y 有什麼不一樣?
Stride_Y 是 source 影像的寬度。
width 是你要顯示的寬度 (clip)。
有一張 1024x768 的圖,你可以只顯示 800x768 (左邊部份)。
所以不一樣.
從 assembly 看,AVPicture 的 structure 好像不是這樣,他用的好像是:
/*
 * Three-plane layout that the assembly routine appears to expect; in
 * practice the routine was also run using this structure.
 */
typedef struct AVPicture {
    char *data[3];      /* plane pointers: Y, U, V */
    int linesize[3];    /* line size of each plane */
} AVPicture;
2.17.2009
Optimize - learned from yuv420_rgb565
- optimze with speed
- 盡量使用 mla (a = (b*c)+d)
- 盡量不使用 condition jump
- 反組譯
- exam asm - 盡量使用所有 register
- 打亂 instruction. 混合 register only arithmetic 和 data load/store
- 先照 assembly algorithm 寫出 c code. - cpu % , 1000 loop time.
- 反組譯, 寫成 .S link - 確認 work
- follow yuv420_rgb565.s 略為修改,但不作 scramble - cpu %, 1000 loop time.
- 作 scramble, 混合 load/store 和 register only instruction. - cpu %, 1000 loop time
Code Reading - 上一篇的YUV - RGB assembly
dummy 都是0,然後兩個加起來, rb 是 16 x 8, g 是16 x 16。
這個大概是跟 paper 講的一樣,用 table 取代 saturation 判斷。
0 的部份就是 負 值的部份。
所以 整個 assembly code 中沒有 compare jmp 指令 (除了最後的 line end 判斷)。
YUV 計算部份直接用 mla ( x +),沒有使用 table -- 大概是因為 ARM 作 16x16 只要1 個 clock,所以沒必要。
一樣是作 w (寬度),然後 line.. 一次(loop) 4 pixel。
因為先作 bit shift (5-6-5)再做查表,所以table[]不用太大。
code 沒有避免使用 mul (mla),反而大量使用,避免不需要的 ldr 動作 (大概是 ldr 和 mla 都是一個 clk 吧)。其中:
- r8 : multy 0x00012A15
- r9 : (Y-16)
- r6 : Coef *(V-128) + 32768
- r5 : Coef*(U-128) + 32768
- r4 : -Coef *(U-128) - Coef*(V-128) + 32768
原來 eVC 的 ARMASM 最佳 example code 就是 OS bsp 下所有的 .s file。
所以語法參考 BSP 就可以了。
MSDN : CE .NET 4.2 ASM
大概要改的是:
- MS armasm 規定只有 label 可以從一行的第一格開始,所以所有其他的 instruction, directive 都要先空
- comment 是以 ; 開頭
- label 不可以加 : 號
- .byte 改為 DCB
- .word 改為 DCD
- .text 宣告要改為 AREA |.text|,CODE, ARM
- .global 要改 EXPORT
http://checko.blogspot.com/2006/09/writting-arm-assembly-in-embedded-vc_28.html :
- 手動加入 yuv420_rgb565.s
- project setting - yuv420_rgb565.s - custom build : 填入 armasm ... (Debug, Relese 都要加)
- 引用的 .cpp 加入:
extern "C" void convert_yuv420_rgb565(char *,char*,int,int);
這個 assembly code 有"適當安排過". 可以看到:
有 memory access 的 instruction,接著的會是 register-only 的 operation。
這樣instruction pipeline 就可以沒有阻礙的run 下去 (如果一個 load 下一個 store, pipe line 要等 cache/memory 的同步?)
作法是:要使用之前,在 n 個 instruction 前就 load...
所以下面是重新安排後的 code,變得比較容易看....
;
; void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results, int w, int h) ;
;
; Converts a planar YUV 4:2:0 picture to RGB565.  Each pass of yuv_loop reads
; one U and one V sample plus a 2x2 group of Y samples and writes two 32-bit
; words: one pixel pair on the current output line and one on the next line.
;
; In:  r0 = picture.  Read as three plane pointers at offsets 0/4/8 (Y, U, V)
;           followed by three strides at offsets 12/16/20 (Y, U, V).
;      r1 = results (RGB565 output; one output line is w*2 bytes)
;      r2 = w, r3 = h
;
; Notes:
;  - The width and height counters are decremented by 2 and tested with BNE,
;    so w and h must be even and non-zero.
;  - V is addressed as U + (V - U); the stride_V slot on the stack is never
;    used, so the routine only works when stride_U == stride_V.
;  - Saturation is done by table lookup (rb_clip / g_clip, each preceded by
;    a zero-filled *_dummy area for negative indices), not by compares.
;
    AREA |.text|,CODE,ARM
    EXPORT convert_yuv420_rgb565
convert_yuv420_rgb565
    stmdb sp!, { r4 - r12, lr } ; all callee saved regs
    ldr r7, [r0, #0] ; Y ptr
    ldr r9, [r0, #4] ; U ptr
    ldr r10, [r0, #8] ; V ptr
    subs r10, r10, r9 ; V ptr - U ptr
    ldr r8, [r0, #12]
    add r8, r8, r7 ; Y + stride_Y
    ldr r4, [r0, #12] ; Stride_Y
    mov r4, r4, lsl #1
    sub r4, r4, r2 ; (2 * Stride_Y) - width
    ldr r5, [r0, #16] ; Stride_U
    sub r5, r5, r2, lsr #1 ; Stride_U - (width / 2)
    ldr r6, [r0, #20] ; Stride_V
    sub r6, r6, r2, lsr #1 ; Stride_V - (width / 2)
    add r0, r1, r2, lsl #1 ; RGB + one line (width * 2 bytes)
    stmdb sp!, { r0-r10 }
    ; Stack description :
    ; (sp+ 0) RGB + one line
    ; (sp+ 4) RGB
    ; (sp+ 8) width (save)
    ; (sp+12) height
    ; (sp+16) (2 * stride_Y) - width
    ; (sp+20) stride_U - (width / 2)
    ; (sp+24) stride_V - (width / 2) !!! UNUSED !!!
    ; (sp+28) Y ptr
    ; (sp+32) Y ptr + one line
    ; (sp+36) U ptr
    ; (sp+40) V - U
    mov lr, r2 ; Initialize the width counter
    add r0, pc, #(const_storage-.-8) ; r0 = base pointer to the constants array
    ldr r8, [r0, #(4*4)] ; r8 = multy
yuv_loop
    ldr r10, [sp, #28] ; r10 = Y
    ldrb r9, [r10, #0] ; r9 = *Y
    add r10, r10, #2 ; r10 = Y + 2
    str r10, [sp, #28] ; save Y + 2
    ldr r1, [sp, #36] ; r1 = U
    ldrb r11, [r1] ; r11 = *U
    add r1, r1, #1 ; r1 = U++
    str r1, [sp, #36] ; store U++
    ldr r2, [sp, #40] ; r2 = V - U
    add r2, r1, r2 ; r2 = V+1
    ldrb r12, [r2, #-1] ; r12 = *V
    sub r11, r11, #128 ; r11 = *U - 128
    sub r12, r12, #128 ; r12 = *V - 128
    mov r7, #32768 ; r7 = 32768 (for additions in MLA)
    add r0, pc, #(const_storage-.-8) ; r0 = base pointer to the constants array
    ldr r1, [r0, #(4*0)] ; r1 = crv
    mla r6, r1, r12, r7 ; r6 = nonyc_r = crv * (*V - 128) + 32768
    ldr r2, [r0, #(4*3)] ; r2 = -cgv
    mla r4, r2, r12, r7 ; r4 = - cgv * (*V - 128) + 32768
    ldr r3, [r0, #(4*1)] ; r3 = cbu
    mla r5, r3, r11, r7 ; r5 = nonyc_b = cbu * (*U - 128) + 32768
    sub r9, r9, #16 ; r9 = *Y - 16
    mla r7, r8, r9, r6 ; r7 = (*Y - 16) * multy + nonyc_r
    ldr r0, [r0, #(4*2)] ; r0 = -cgu
    mla r4, r0, r11, r4 ; r4 = nonyc_g = - cgu * (*U - 128) + r4 = - cgu * (*U - 128) - cgv * (*V - 128) + 32768
    add r0, pc, #(rb_clip-.-8) ; r0 contains the pointer to the R and B clipping array
    ldrb r7, [r0, r7, asr #(16+3)] ; r7 = R component
    mla r12, r8, r9, r5 ; r12 = (*Y - 16) * multy + nonyc_b
    mla r1, r8, r9, r4 ; r1 = (*Y - 16) * multy + nonyc_g
    ldrb r12, [r0, r12, asr #(16+3)] ; r12 = B component (and the start of the RGB word)
    add r12, r12, r7, lsl #11 ; r12 = R.B ...
    add r11, pc, #(g_clip-.-8) ; r11 now contains the pointer to the G clipping array
    ldrb r1, [r11, r1, asr #(16+2)] ; r1 contains the G part of the RGB triplet
    add r12, r12, r1, lsl #5 ; r12 = RGB ... (ie the first pixel (half-word) is done)
    ; --- next pixel
    ldrb r9, [r10, #-1] ; r9 = *(Y+1)
    sub r9, r9, #16 ; r9 = *(Y+1) - 16
    mla r10, r8, r9, r6 ; r10 is the Red part of the RGB triplet
    mla r7, r8, r9, r5 ; r7 is the Blue part of the RGB triplet
    mla r2, r8, r9, r4 ; r2 is the Green part of the RGB triplet
    ldrb r2, [r11, r2, asr #(16+2)] ; r2 = G component
    add r12, r12, r2, lsl #(5+16) ; r12 = RGB .G.
    ldrb r7, [r0, r7, asr #(16+3)] ; r7 = B component
    add r12, r12, r7, lsl #(0+16) ; r12 = RGB .GB
    ldrb r10, [r0, r10, asr #(16+3)] ; r10 = R component
    add r12, r12, r10, lsl #(11+16) ; r12 = RGB RGB
    ;---- do store ----
    ldr r3, [sp, #4] ; r3 = RGB
    add r3, r3, #4 ; r3 = RGB++ (ie next double-pixel)
    str r3, [sp, #4] ; store the RGB pointer
    ; r3 has already been advanced, so the pair belongs at r3 - 4 (same
    ; pattern as the second-line store below); storing at [r3] would shift
    ; the whole upper line two pixels to the right.
    str r12, [r3, #-4] ; store the rgb pixel at *RGB
    ;---- next line ----
    ldr r1, [sp, #32] ; r1 = Ynext
    ldrb r9, [r1] ; r9 = *Ynext
    sub r9, r9, #16 ; r9 = *Ynext - 16
    mla r2, r8, r9, r4 ; r2 is the Green part of the RGB triplet
    mla r7, r8, r9, r5 ; r7 is the Blue part of the RGB triplet
    mla r10, r8, r9, r6 ; r10 is the Red part of the RGB triplet
    ldrb r12, [r0, r7, asr #(16+3)] ; r12 = ..B ...
    ldrb r10, [r0, r10, asr #(16+3)] ; r10 = R component
    add r12, r12, r10, lsl #11 ; r12 = R.B ...
    ldrb r2, [r11, r2, asr #(16+2)] ; r2 = G component
    add r12, r12, r2, lsl #5 ; r12 = RGB ...
    ;---- next pixel
    ldrb r9, [r1, #1] ; r9 = *(Ynext+1)
    sub r9, r9, #16 ; r9 = *(Ynext+1) - 16
    add r1, r1, #2 ; r1 = Ynext + 2
    str r1, [sp, #32] ; store the increased Ynext pointer
    mla r7, r8, r9, r5 ; r7 is the Blue part of the RGB triplet
    mla r10, r8, r9, r6 ; r10 is the Red part of the RGB triplet
    mla r2, r8, r9, r4 ; r2 is the Green part of the RGB triplet
    ldrb r7, [r0, r7, asr #(16+3)] ; r7 = B component
    add r12, r12, r7, lsl #(16+0) ; r12 = RGB ..B
    ldrb r10, [r0, r10, asr #(16+3)] ; r10 = R component
    add r12, r12, r10, lsl #(16+11) ; r12 = RGB R.B
    ldrb r2, [r11, r2, asr #(16+2)] ; r2 = G component
    add r12, r12, r2, lsl #(16+5) ; r12 = RGB RGB
    ;---- do store
    ldr r3, [sp, #0] ; r3 = RGBnext pointer
    add r3, r3, #4 ; r3 = next pixel on the RGBnext line
    str r12, [r3, #-4] ; store the next pixel
    str r3, [sp, #0] ; store the increased 'next line' pixel pointer
    ;-- complete, do loop --
    subs lr, lr, #2 ; decrement the width counter (two pixels per pass)
    bne yuv_loop ; and restart if not at the end of the line
    ldr r0, [sp, #8] ; r0 = saved width
    ldr r1, [sp, #0] ; r1 = RGBnext pointer
    mov lr, r0 ; lr = saved width (to restart the line counter)
    str r1, [sp, #4] ; current RGBnext pointer is next iteration RGB pointer
    add r1, r1, r0, lsl #1 ; r1 = update RGBnext to next line
    str r1, [sp, #0] ; store updated RGBnext pointer
    ldr r3, [sp, #16] ; r3 = (2 * stride_Y) - width
    ldr r4, [sp, #28] ; r4 = Y ptr
    ldr r5, [sp, #32] ; r5 = Ynext ptr
    add r4, r4, r3 ; r4 = Y ptr for the next two lines
    add r5, r5, r3 ; r5 = Ynext ptr for the next two lines
    str r4, [sp, #28] ; store updated Y pointer
    str r5, [sp, #32] ; store update Ynext pointer
    ldr r1, [sp, #20] ; r1 = stride_U - (width / 2)
    ldr r2, [sp, #36] ; r2 = U ptr
    ldr r6, [sp, #12] ; get height counter
    add r2, r2, r1 ; update U ptr
    str r2, [sp, #36] ; store updated U ptr (and update 'V' at the same time :-) )
    subs r6, r6, #2 ; two lines done per outer pass
    str r6, [sp, #12]
    bne yuv_loop
    ; Exit cleanly :-)
    add sp, sp, #(11*4) ; remove all custom things from stack
    ldmia sp!, { r4 - r12, pc } ; restore callee saved regs and return
const_storage
    ; In order : crv, cbu, - cgu, - cgv, multy
    DCD 0x00019895, 0x00020469, 0xffff9bb5, 0xffff2fe1, 0x00012A15
rb_clip_dummy
    ; zero area below rb_clip: negative table indices clamp to 0
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
rb_clip
    ; 5-bit ramp 0x00..0x1f, then saturated at 0x1f
    DCB 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
    DCB 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
    DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
    DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
    DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
g_clip_dummy
    ; zero area below g_clip: negative table indices clamp to 0
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
g_clip
    ; 6-bit ramp 0x00..0x3f, then saturated at 0x3f
    DCB 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
    DCB 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
    DCB 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f
    DCB 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f
    DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
    DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
    DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
    DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
    DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
    DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
    END
2.16.2009
YUV to RGB in ARMv4 assembly
(http://www.koders.com/noncode/fid9B79A2EAD6C3F6EE8454AB93E5D9F77A1C509D19.aspx?s=mp3)
用 assembly 寫的,因為看到 版權宣告是 free 的,所以把全部內容都貼出來:
/*
Copyright (c) 2001 Lionel Ulmer ([email protected] / [email protected])
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* convert_yuv420_rgb565 : convert planar YUV 4:2:0 to packed RGB565.
Each pass of the inner loop consumes one U byte, one V byte and a 2x2 block of
Y bytes, and writes two 32-bit words (two pixels on the current output line and
two pixels on the line below it).

WARNING : this function only works when stride_U == stride_V (I use some hacks to
not have to do too many computations at line's end)...
C-like prototype :
void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results, unsigned char *_py,unsigned char *_pu,unsigned char *_pv);

Notes taken from the code below:
- picture (r0) is read as 32-bit words at byte offsets 0, 4, 12, 16 and 20,
  which the original comments name dest width, dest height, source width,
  source height and RGB stride (in bytes).
  NOTE(review): confirm these offsets against the structure the caller passes.
- _pv (the 5th argument) is never read. V is addressed as
  _pu + (source width * source height) / 4.
- dest width and dest height are saved on the stack but never read back.
- The Y pointers advance by source width per line and the U pointer is never
  re-based at the end of a line, so the planes are treated as tightly packed.
- Both loops subtract 2 and exit only when the counter reaches exactly 0, so
  source width and source height have to be even and non-zero.
*/
#ifdef __arm__
.text
.align
.global convert_yuv420_rgb565
convert_yuv420_rgb565:
stmdb sp!, { r4 - r12, lr } @ save all callee saved regs and the return address
ldr r7,[r0,#0] @ r7 = dest width (saved below, not used afterwards)
ldr r9,[r0,#4] @ r9 = dest height (saved below, not used afterwards)
ldr r10,[r0, #12] @ r10 = source width
ldr r5, [r0, #16] @ r5 = source height
mul r4,r5,r10 @ r4 = source height * source width
mov r4,r4,lsr#2 @ r4 = (width * height) / 4, used as V ptr - U ptr
ldr r6, [r0, #20] @ r6 = RGB stride
ldr r8,[r0, #12] @ r8 = source width (loaded a second time)
add r8, r8, r2 @ r8 = _py + source width = second Y line (Ynext)
add r0,r1,r6 @ r0 = results + RGB stride = second output line
stmdb sp!, { r0-r10 }
@ Stack description (11 words, released at the exit with add sp, sp, #(11*4)) :
@ (sp+ 0) RGB + one line r0
@ (sp+ 4) RGB r1
@ (sp+ 8) _py r2
@ (sp+12) _pu r3
@ (sp+16) _pv - _pu r4 (computed above, not taken from the _pv argument)
@ (sp+20) sourceheight r5 (used as the remaining-lines counter)
@ (sp+24) rgbstrid r6
@ (sp+28) destwidth r7 (never read back)
@ (sp+32) Ynext r8
@ (sp+36) destheight r9 (never read back)
@ (sp+40) sourcewidth r10
mov lr,r10 @ Initialize the width counter
add r0, pc, #(const_storage-.-8) @ r0 = base pointer to the constants array
ldr r8, [r0, #(4*4)] @ r8 = multy (r8 is not written again in this routine)
yuv_loop:
@ --- chroma terms, shared by the four pixels of the 2x2 block ---
add r0, pc, #(const_storage-.-8) @ r0 = base pointer to the constants array
ldr r10, [sp, #8] @ r10 = Y ...
ldr r1, [sp, #12] @ r1 = U ...
ldrb r9, [r10, #0] @ r9 = *Y ...
ldrb r11, [r1] @ r11 = *U
add r1, r1, #1 @ r1 = U++
ldr r2, [sp, #16] @ r2 = V - U ...
str r1, [sp, #12] @ store U++
add r2, r2, r1 @ r2 = V+1
ldrb r12, [r2, #-1] @ r12 = *V
sub r11, r11, #128 @ r11 = *U - 128
sub r12, r12, #128 @ r12 = *V - 128
ldr r1, [r0, #(4*0)] @ r1 = crv
mov r7, #32768 @ r7 = 32768 (for additions in MLA; 0.5 in 16.16 fixed point)
ldr r2, [r0, #(4*3)] @ r2 = -cgv
mla r6, r1, r12, r7 @ r6 = nonyc_r = crv * (*V - 128) + 32768
ldr r3, [r0, #(4*1)] @ r3 = cbu
mla r4, r2, r12, r7 @ r4 = - cgv * (*V - 128) + 32768
sub r9, r9, #16 @ r9 = *Y - 16
mla r5, r3, r11, r7 @ r5 = nonyc_b = cbu * (*U - 128) + 32768
ldr r0, [r0, #(4*2)] @ r0 = -cgu
@ --- first line, first pixel ---
mla r7, r8, r9, r6 @ r7 = (*Y - 16) * multy + nonyc_r
add r10, r10, #2 @ r10 = Y + 2
mla r4, r0, r11, r4 @ r4 = nonyc_g = - cgu * (*U - 128) + r4 = - cgu * (*U - 128) - cgv * (*V - 128) + 32768
add r0, pc, #(rb_clip-.-8) @ r0 contains the pointer to the R and B clipping array
mla r12, r8, r9, r5 @ r12 = (*Y - 16) * multy + nonyc_b
ldrb r7, [r0, r7, asr #(16+3)] @ r7 = R component (signed index, clipped to 0..0x1f by the table)
mla r1, r8, r9, r4 @ r1 = (*Y - 16) * multy + nonyc_g
ldrb r9, [r10, #-1] @ r9 = *(Y+1)
str r10, [sp, #8] @ save Y + 2
ldrb r12, [r0, r12, asr #(16+3)] @ r12 = B component (and the start of the RGB word)
add r11, pc, #(g_clip-.-8) @ r11 now contains the pointer to the G clipping array
ldrb r1, [r11, r1, asr #(16+2)] @ r1 contains the G part of the RGB triplet (clipped to 0..0x3f)
@ --- first line, second pixel ---
sub r9, r9, #16 @ r9 = *(Y+1) - 16
mla r10, r8, r9, r6 @ r10 is the Red part of the RGB triplet
add r12, r12, r7, lsl #11 @ r12 = R.B ... (R shifted to bit 11, B at bit 0, G still missing)
mla r7, r8, r9, r5 @ r7 is the Blue part of the RGB triplet
add r12, r12, r1, lsl #5 @ r12 = RGB ... (ie the first pixel (half-word) is done)
mla r2, r8, r9, r4 @ r2 is the Green part of the RGB triplet
ldrb r10, [r0, r10, asr #(16+3)] @ r10 = R component
ldrb r7, [r0, r7, asr #(16+3)] @ r7 = B component
ldr r1, [sp, #32] @ r1 = Ynext
ldrb r2, [r11, r2, asr #(16+2)] @ r2 = G component
ldrb r9, [r1] @ r9 = *Ynext
add r12, r12, r2, lsl #(5+16) @ r12 = RGB .G.
sub r9, r9, #16 @ r9 = *Ynext - 16
@ --- second line, first pixel (computation overlaps the end of the first line) ---
mla r2, r8, r9, r4 @ r2 is the Green part of the RGB triplet
add r12, r12, r7, lsl #(0+16) @ r12 = RGB .GB
mla r7, r8, r9, r5 @ r7 is the Blue part of the RGB triplet
add r12, r12, r10, lsl #(11+16) @ r12 = RGB RGB
ldr r3, [sp, #4] @ r3 = RGB
mla r10, r8, r9, r6 @ r10 is the Red part of the RGB triplet
str r12, [r3] @ store both pixels at *RGB (first pixel in the low half-word, second in the high half-word)
add r3, r3, #4 @ r3 = RGB++ (ie next double-pixel)
str r3, [sp, #4] @ store the RGB pointer
ldrb r9, [r1, #1] @ r9 = *(Ynext+1)
add r1, r1, #2 @ r1 = Ynext + 2
sub r9, r9, #16 @ r9 = *(Ynext+1) - 16
ldrb r12, [r0, r7, asr #(16+3)] @ r12 = ..B ...
ldrb r10, [r0, r10, asr #(16+3)] @ r10 = R component
@ --- second line, second pixel ---
mla r7, r8, r9, r5 @ r7 is the Blue part of the RGB triplet
add r12, r12, r10, lsl #11 @ r12 = R.B ...
ldrb r2, [r11, r2, asr #(16+2)] @ r2 = G component
mla r10, r8, r9, r6 @ r10 is the Red part of the RGB triplet
add r12, r12, r2, lsl #5 @ r12 = RGB ...
mla r2, r8, r9, r4 @ r2 is the Green part of the RGB triplet
ldrb r7, [r0, r7, asr #(16+3)] @ r7 = B component
str r1, [sp, #32] @ store the increased Ynext pointer
add r12, r12, r7, lsl #(16+0) @ r12 = RGB ..B
ldrb r10, [r0, r10, asr #(16+3)] @ r10 = R component
ldr r3, [sp, #0] @ r3 = RGBnext pointer
add r12, r12, r10, lsl #(16+11) @ r12 = RGB R.B
ldrb r2, [r11, r2, asr #(16+2)] @ r2 = G component
add r3, r3, #4 @ r3 = next pixel on the RGBnext line
add r12, r12, r2, lsl #(16+5) @ r12 = RGB RGB
str r12, [r3, #-4] @ store both pixels of the second line
str r3, [sp, #0] @ store the increased 'next line' pixel pointer
subs lr, lr, #2 @ two columns done: decrement the width counter
bne yuv_loop @ and restart if not at the end of the line
@ --- end of a line pair: re-base the output and Y pointers, restart the width counter ---
ldr r0, [sp, #40] @ r0 = saved sourcewidth ....
ldr r1, [sp, #0] @ r1 = RGBnext pointer (now 2 * width bytes past the start of its line)
ldr r2, [sp, #24] @ r2 = RGB stride
mov lr, r0 @ lr = saved width (to restart the line counter)
subs r3,r2,r0,lsl#1 @ r3 = RGB stride - 2 * width (the flags set here are not used)
add r1,r1,r3 @ r1 = start of the line following the RGBnext line
str r1, [sp, #4] @ that line is the RGB pointer of the next iteration
add r1,r1,r2 @ r1 = update RGBnext to next line
str r1, [sp, #0] @ store updated RGBnext pointer
ldr r3, [sp, #40] @ sourcewidth
ldr r4, [sp, #8] @ r4 = Y ptr
ldr r5, [sp, #32] @ r5 = Ynext ptr
add r4, r4, r3 @ r4 = Y ptr for the next two lines
add r5, r5, r3 @ r5 = Ynext ptr for the next two lines
str r4, [sp, #8] @ store updated Y pointer
str r5, [sp, #32] @ store update Ynext pointer
@ The U pointer at (sp+12) is left as it is: it keeps advancing one byte per two columns.
ldr r6, [sp, #20] @ get height counter
subs r6, r6, #2 @ two source lines were consumed
str r6, [sp, #20]
bne yuv_loop @ loop until the height counter is exactly 0
@ Exit cleanly :-)
add sp, sp, #(11*4) @ remove all custom things from stack
ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
const_storage:
@ Conversion coefficients, read by the code at word offsets 4*0 .. 4*4.
@ In order : crv, cbu, - cgu, - cgv, multy
@ The code adds 32768 and shifts the products right by at least 16, so these are
@ 16.16 fixed-point values: about 1.596, 2.017, -0.392, -0.813 and 1.164.
@ NOTE(review): these look like the usual ITU-R BT.601 video-range coefficients - confirm.
.word 0x00019895, 0x00020469, 0xffff9bb5, 0xffff2fe1, 0x00012A15
@ Clipping table for the 5-bit R and B components.
@ The code indexes rb_clip with a signed value (accumulator asr #(16+3)).
@ rb_clip_dummy has to stay immediately before rb_clip: negative indices
@ (down to -48) land in it and read 0.
rb_clip_dummy:
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
@ Indices 0..31 map to themselves, indices 32..79 saturate to 0x1f.
rb_clip:
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
.byte 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
.byte 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
.byte 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
@ Clipping table for the 6-bit G component.
@ The code indexes g_clip with a signed value (accumulator asr #(16+2)).
@ g_clip_dummy has to stay immediately before g_clip: negative indices
@ (down to -96) land in it and read 0.
g_clip_dummy:
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
@ Indices 0..63 map to themselves, indices 64..159 saturate to 0x3f.
g_clip:
.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
.byte 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
.byte 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f
.byte 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f
.byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
.byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
.byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
.byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
.byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
.byte 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
#endif