DelphiのTBitmap用　画像縮小コード

前に作ってたソフトで使っていたソースを公開。
DHGL(Delphi High-Level Graphic Library)のソースを元に加工しています。
Delphiだけでは最適化が難しいので、一部の処理をC++のSSE命令（組み込み関数）で記述してDLL化し、Delphi側から呼び出しています。
SSEの使い方の参考にもなるかと思います。

Delphi側

unit BitmapUtils;

interface

uses Windows, Classes, Graphics, SysUtils;


// Creates and returns a new bitmap of Width x Height built from Bitmap.
// The returned TBitmap is a fresh object; the caller is responsible for freeing it.
function Shrink_SSE(Bitmap: TBitmap; Width, Height: Integer): TBitmap;

type
    // One 24-bit pixel: three bytes declared in B, G, R order, packed (no padding).
    TTriple = packed record
        B, G, R: Byte;
    end;
    // Array type with a very large upper bound, intended to be used through
    // PTripleArray so that a row of pixels can be indexed from a pointer.
    TTripleArray = array[0..40000000] of TTriple;
    PTripleArray = ^TTripleArray;

// Resampling routine imported from exf.dll (stdcall).
// As called from Shrink_SSE: S and N are the arrays of per-row pixel pointers
// for the source and the new bitmap, SW/SH are the source width/height and
// DW/DH are the width/height of the new bitmap.
procedure Shrink_2(S, N:pointer; SW, SH, DW, DH:integer); stdcall; external 'exf.dll';

implementation

// Shrink_SSE
//   Builds a Width x Height pf24bit bitmap from Bitmap by handing per-row
//   pixel pointers of both bitmaps to Shrink_2 (imported from exf.dll).
//
//   Bitmap        : source image. It is copied into a temporary pf24bit
//                   bitmap, so the caller's object is not modified.
//   Width, Height : size of the bitmap to create.
//   Result        : newly created TBitmap; the caller must free it.
//
//   If anything raises while the result is being built, the partially built
//   bitmap is freed and the exception is re-raised.
//
//   NOTE(review): zero-sized source or target bitmaps are not special-cased;
//   ScanLine[0] is still requested for them - confirm how callers treat that.
function Shrink_SSE(Bitmap: TBitmap; Width, Height: Integer): TBitmap;
var
    NewBitmap, SourceBitmap: TBitmap;

    // Dimensions of the source bitmap
    SourceWidth, SourceHeight: Integer;

    // Per-row pixel pointers handed to the DLL
    SourceScans, NewScans: array of PTripleArray;

    i: Integer;

    ScanLinePtr0  : Pointer;
    ScanLineOffset: Longint;
begin
    SourceWidth := Bitmap.Width;
    SourceHeight := Bitmap.Height;

    NewBitmap := TBitmap.Create;
    try
        // Create the destination bitmap
        NewBitmap.PixelFormat := pf24bit;
        NewBitmap.Width := Width;
        NewBitmap.Height := Height;

        // Work on a 24-bit copy of the source
        SourceBitmap := TBitmap.Create;
        try
            SourceBitmap.Assign(Bitmap);
            SourceBitmap.PixelFormat := pf24bit;

            // Build the row pointer tables
            SetLength(SourceScans, SourceHeight);
            SetLength(NewScans, Height);

            // The distance between rows is measured from the first two rows,
            // so ScanLine[1] may only be read when a second row exists.
            // With a single row the offset is never used and stays 0.
            ScanLinePtr0 := SourceBitmap.ScanLine[0];
            ScanLineOffset := 0;
            if SourceBitmap.Height > 1 then
                ScanLineOffset := Integer(SourceBitmap.ScanLine[1]) - Integer(ScanLinePtr0);

            for i := 0 to SourceHeight - 1 do
                SourceScans[i] := Pointer(Integer(ScanLinePtr0) + ScanLineOffset * i);

            // Same for the destination bitmap
            ScanLinePtr0 := NewBitmap.ScanLine[0];
            ScanLineOffset := 0;
            if NewBitmap.Height > 1 then
                ScanLineOffset := Integer(NewBitmap.ScanLine[1]) - Integer(ScanLinePtr0);

            for i := 0 to Height - 1 do
                NewScans[i] := Pointer(Integer(ScanLinePtr0) + ScanLineOffset * i);

            // The DLL reads SourceScans and writes the pixels behind NewScans
            Shrink_2(Pointer(SourceScans), Pointer(NewScans), SourceWidth, SourceHeight, Width, Height);
        finally
            SourceBitmap.Free;
        end;
    except
        NewBitmap.Free;
        raise;
    end;

    Result := NewBitmap;
end;

end.

C++側

#include <xmmintrin.h>
#include <emmintrin.h>

// One 24-bit pixel as three consecutive bytes in B, G, R order.
// Packed to 1-byte alignment so that sizeof(TTriple) is exactly 3 and an
// array of TTriple can be indexed pixel by pixel.
#pragma pack(1)
typedef struct {
    unsigned char B, G, R;
} TTriple;
#pragma pack()

// Four floats (R, G, B plus one padding member), declared 16-byte aligned.
// Shrink_2 stores a whole __m128 into a variable of this type with
// _mm_store_ps and then reads the R, G and B members back.
typedef __declspec(align(16)) struct {
    float R;
    float G;
    float B;
    float Dummy;   // fills the fourth float lane; its value is never read
} TFloatTriple;

// Converts a double to int with truncation toward zero (CVTTSD2SI).
// Uses the SSE2 intrinsic instead of an inline-asm body without a return
// statement, so the function has a well-defined return value and also
// builds where MSVC-style __asm blocks are not available.
inline int trunc_double_sse2(double x)
{
    return _mm_cvttsd_si32(_mm_set_sd(x));
}

// Converts a float to int with truncation toward zero (CVTTSS2SI).
// Uses the SSE intrinsic instead of an inline-asm body without a return
// statement, so the function has a well-defined return value and also
// builds where MSVC-style __asm blocks are not available.
inline int trunc_float_sse(float x)
{
    return _mm_cvttss_si32(_mm_set_ss(x));
}

/*
 * Shrink_2 - area-weighted resampling of 24-bit pixels.
 *
 * Every destination pixel (x, y) is projected onto the source as the
 * rectangle [x*xRatio, (x+1)*xRatio) x [y*yRatio, (y+1)*yRatio). Each source
 * pixel touched by that rectangle is accumulated, weighted by the size of
 * the overlap, and the sum is divided by the rectangle area.
 *
 *   sourceScans  : sourceHeight row pointers, each to sourceWidth pixels (read)
 *   newScans     : height row pointers, each to width pixels (written)
 *   sourceWidth,
 *   sourceHeight : size of the source image
 *   width,
 *   height       : size of the destination image
 *
 * Nothing is done when a pointer table is null or any dimension is <= 0.
 * Channel values are rounded to nearest and clamped to 0..255.
 */
extern "C" void __stdcall Shrink_2(TTriple **sourceScans, TTriple **newScans, int sourceWidth, int sourceHeight, int width, int height);

void __stdcall Shrink_2(TTriple **sourceScans, TTriple **newScans, int sourceWidth, int sourceHeight, int width, int height)
{
    int x, y, i, j;
    TTriple *pSourceScan;
    TTriple *pNewScan;
    double rectTop, rectLeft, rectRight, rectBottom;
    int trunced_rectTop, trunced_rectBottom;
    int rectLeft_i, rectRight_i;
    float ratio;
    double xRatio, yRatio;
    TFloatTriple pixel;
    TTriple sourcePixel;
    float w, h;

    __m128 xmmv_ratio_rcp;
    __m128 xmmv_half;
    __m128 xmmv_max;
    __m128 xmmv_w;
    __m128 xmmv_spx;
    __m128 xmmv_dpx;

    // Degenerate input: avoids division by zero below and reads through
    // missing rows.
    if(sourceScans == 0 || newScans == 0)
        return;
    if(sourceWidth <= 0 || sourceHeight <= 0 || width <= 0 || height <= 0)
        return;

    // Source pixels per destination pixel. The product is formed in double
    // so that large images cannot overflow int.
    ratio = (float)((double)sourceWidth * (double)sourceHeight / (double)width / (double)height);
    xRatio = (double)sourceWidth / (double)width;
    yRatio = (double)sourceHeight / (double)height;

    xmmv_ratio_rcp = _mm_set_ps1(1 / ratio);
    xmmv_half = _mm_set_ps1(0.5f);
    xmmv_max = _mm_set_ps1(255.0f);

    for(y=0; y<height; y++) {
        pNewScan = newScans[y];

        // Vertical extent of this destination row in source coordinates.
        // The small epsilon presumably keeps an exactly-integral far edge
        // from selecting one extra source row.
        rectTop    = y * yRatio;
        rectBottom = (y+1) * yRatio - 0.000001;

        trunced_rectTop = trunc_double_sse2(rectTop);
        trunced_rectBottom = trunc_double_sse2(rectBottom);
        if(trunced_rectBottom > sourceHeight - 1)
            trunced_rectBottom = sourceHeight - 1;   // never read past the last row

        for(x=0; x<width; x++) {
            // Project the destination pixel onto the source.
            rectLeft   = x * xRatio;
            rectRight  = (x+1) * xRatio - 0.000001;

            rectLeft_i = trunc_double_sse2(rectLeft);
            rectRight_i = trunc_double_sse2(rectRight);
            if(rectRight_i > sourceWidth - 1)
                rectRight_i = sourceWidth - 1;       // never read past the last column

            // Accumulate every source pixel that intersects the projected
            // rectangle (lanes: R, G, B, unused).
            xmmv_dpx = _mm_setzero_ps();

            for(j=trunced_rectTop; j<=trunced_rectBottom; j++) {
                pSourceScan = sourceScans[j];

                _mm_prefetch((char *)pSourceScan, _MM_HINT_NTA);

                // Height of the overlap with source row j. It depends only
                // on j, so it is computed once per row.
                if( (rectTop < j) && ((j+1) < rectBottom) )
                    h = 1;
                else if( (j <= rectTop) && ((j+1) < rectBottom) )
                    h = 1 - (rectTop - j);
                else if( (rectTop < j) && (rectBottom <= (j+1)) )
                    h = rectBottom - j;
                else
                    h = rectBottom - rectTop;

                for(i=rectLeft_i; i<=rectRight_i; i++) {
                    sourcePixel = pSourceScan[i];

                    // Width of the overlap with source column i.
                    if( (rectLeft < i) && ((i+1) < rectRight) )
                        w = 1;
                    else if( (i <= rectLeft) && ((i+1) < rectRight) )
                        w = 1 - (rectLeft - i);
                    else if( (rectLeft < i) && (rectRight <= (i+1)) )
                        w = rectRight - i;
                    else
                        w = rectRight - rectLeft;

                    // Add one source pixel, weighted by the overlap area.
                    xmmv_w = _mm_set_ps1(w * h);
                    xmmv_spx = _mm_setr_ps(sourcePixel.R, sourcePixel.G, sourcePixel.B, 0);

                    xmmv_w = _mm_mul_ps(xmmv_w, xmmv_spx);
                    xmmv_dpx = _mm_add_ps(xmmv_dpx, xmmv_w);
                }
            }

            // Turn the sum into an average. The weights add up to slightly
            // less than `ratio` because of the epsilon, so plain truncation
            // would turn a uniform 255 into 254; round to nearest instead
            // and clamp to the byte range before converting.
            xmmv_dpx = _mm_mul_ps(xmmv_dpx, xmmv_ratio_rcp);
            xmmv_dpx = _mm_add_ps(xmmv_dpx, xmmv_half);
            xmmv_dpx = _mm_max_ps(xmmv_dpx, _mm_setzero_ps());
            xmmv_dpx = _mm_min_ps(xmmv_dpx, xmmv_max);
            _mm_store_ps((float *)&pixel, xmmv_dpx);

            pNewScan[x].R = (unsigned char)trunc_float_sse(pixel.R);
            pNewScan[x].G = (unsigned char)trunc_float_sse(pixel.G);
            pNewScan[x].B = (unsigned char)trunc_float_sse(pixel.B);
        }
    }
}