xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #437: csc_opencl-nofloat-v2.patch

File csc_opencl-nofloat-v2.patch, 23.5 KB (added by Antoine Martin, 8 years ago)

implements all calculations using integers

  • xpra/codecs/csc_opencl/opencl_kernels.py

     
    1 # Copyright (C) 2011 Michael Zucchi
    2 # This file is based on code from socles, an OpenCL image processing library.
     1# This file is part of Xpra.
     2# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
     3# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
     4# later version. See the file COPYING for details.
     5
     6# This file is vaguely inspired by code from socles, an OpenCL image processing library.
    37#
    4 # socles is free software: you can redistribute it and/or modify
    5 # it under the terms of the GNU General Public License as published by
    6 # the Free Software Foundation, either version 3 of the License, or
    7 # (at your option) any later version.
    8 #
    9 # socles is distributed in the hope that it will be useful,
    10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
    11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    12 # GNU General Public License for more details.
    13 #
    14 # You should have received a copy of the GNU General Public License
    15 # along with socles.  If not, see <http://www.gnu.org/licenses/>.
     8# Notes:
     9# * we use integer arithmetic by pre-multiplying coefficients by 2*20
     10#   and use a fast bit shift to get the result as an 8-bit unsigned char.
     11# * each Y/U/V channel is passed in as a single channel image2d
     12# * we allow downscaling
     13# * the image sampler is passed in (responsibility of the caller to choose the right one)
     14# * we deal with odd sized images gracefully by clamping the output (via runtime checks)
     15#   as well as the input (if sampler_t uses CLAMP_TO_EDGE)
    1616
    1717
    18 YUV_TO_RGB = {"X"    : "1.0",
    19               "A"    : "1.0",
    20               "R"    : "Y + 1.5958 * Cb",
    21               "G"    : "Y - 0.39173*Cr-0.81290*Cb",
    22               "B"    : "Y + 2.017*Cr"
     18YUV_TO_RGB = {"X"    : [1.0],
     19              "A"    : [1.0],
     20              "R"    : [1.0, "*", "Y", "+", 1.5958,  "*", "Cb"],
     21              "G"    : [1.0, "*", "Y", "-", 0.39173, "*", "Cr", "-", 0.8129, "*", "Cb"],
     22              "B"    : [1.0, "*", "Y", "+", 2.017,   "*", "Cr"],
    2323              }
    2424
    25 #Cr width div, Cr heigth div, Cb width div, Cb width div
     25def get_RGB_formulae(rgb_channel, multiplier=2**20):
     26    #given an RGB channel (R, G or B), return the formulae for it
     27    #which uses the named variables Y, Cb (aka U) and Cr (aka V)
     28    f = YUV_TO_RGB[rgb_channel]     #ie: ["Y", "+", 1.5958, "*", "Cb"]
     29    mf = []
     30    for x in f:
     31        if type(x)==float:
     32            x = int(round(x*multiplier))    #1.5958 -> 1673318
     33        mf.append(x)
     34    return " ".join([str(x) for x in mf])
     35
     36
    2637YUV_FORMATS = ("YUV444P", "YUV422P", "YUV420P")
    2738
    2839def indexes_to_rgb_mode(RGB_args):
     
    7081    RGB_args = rgb_mode_to_indexes(rgb_format)
    7182    assert len(RGB_args)==4, "we need 4 RGB components (R,G,B and A or X), not: %s" % RGB_args
    7283    kname = "%s_to_%s" % (yuv_format, indexes_to_rgb_mode(RGB_args))
    73     args = tuple([kname] + [YUV_TO_RGB[c] for c in rgb_format])
     84    rgb_args = [get_RGB_formulae(x) for x in rgb_format]
     85    args = tuple([kname] + rgb_args)
    7486    kstr = """
    7587__kernel void %s(read_only image2d_t srcY, read_only image2d_t srcU, read_only image2d_t srcV,
    76               uint srcw, uint srch, uint w, uint h,
     88              const uint srcw, const uint srch, const uint w, const uint h,
    7789              const sampler_t sampler, write_only image2d_t dst) {
    78     uint gx = get_global_id(0);
    79     uint gy = get_global_id(1);
    80     uint srcx = gx*srcw/w;
    81     uint srcy = gy*srch/h;
     90    const uint gx = get_global_id(0);
     91    const uint gy = get_global_id(1);
     92    const uint srcx = gx*srcw/w;
     93    const uint srcy = gy*srch/h;
    8294
    8395    if ((gx < w) & (gy < h)) {
    84         float4 p;
     96        uint4 p;
    8597
    86         int2 src = (int2)( srcx, srcy );
    87         float Y = 1.1643 * read_imagef(srcY, sampler, src).s0 - 0.0625;
    88         float Cr = read_imagef(srcU, sampler, src).s0 - 0.5f;
    89         float Cb = read_imagef(srcV, sampler, src).s0 - 0.5f;
     98        const int2 src = (int2)( srcx, srcy );
     99        const int Y  = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
     100        const int Cr = 1048576 * read_imageui(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288;
     101        const int Cb = 1048576 * read_imageui(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288;
    90102
    91         p.s0 = %s;
    92         p.s1 = %s;
    93         p.s2 = %s;
    94         p.s3 = %s;
     103        p.s0 = convert_uchar_sat_rte(%s>>20);
     104        p.s1 = convert_uchar_sat_rte(%s>>20);
     105        p.s2 = convert_uchar_sat_rte(%s>>20);
     106        p.s3 = convert_uchar_sat_rte(%s>>20);
    95107
    96         write_imagef(dst, (int2)( gx, gy ), p);
     108        write_imageui(dst, (int2)( gx, gy ), p);
    97109    }
    98110}
    99111"""
     
    107119    RGB_args = rgb_mode_to_indexes(rgb_format)
    108120    assert len(RGB_args)==4, "we need 4 RGB components (R,G,B and A or X), not: %s" % RGB_args
    109121    kname = "%s_to_%s" % (yuv_format, indexes_to_rgb_mode(RGB_args))
    110     args = tuple([kname] + [YUV_TO_RGB[c] for c in rgb_format]*2)
     122    rgb_args = [get_RGB_formulae(x) for x in rgb_format]
     123    args = tuple([kname] + rgb_args*2)
    111124    kstr = """
    112125__kernel void %s(read_only image2d_t srcY, read_only image2d_t srcU, read_only image2d_t srcV,
    113               uint srcw, uint srch, uint w, uint h,
     126              const uint srcw, const uint srch, const uint w, const uint h,
    114127              const sampler_t sampler, write_only image2d_t dst) {
    115     uint gx = get_global_id(0);
    116     uint gy = get_global_id(1);
     128    const uint gx = get_global_id(0);
     129    const uint gy = get_global_id(1);
    117130
    118131    if ((gx*2 < w) & (gy < h)) {
    119         float4 p;
     132        uint4 p;
    120133
    121134        uint srcx = gx*2*srcw/w;
    122         uint srcy = gy*srch/h;
    123         float Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;
    124         float Cr = read_imagef(srcU, sampler, (int2)( srcx/2, srcy )).s0 - 0.5f;
    125         float Cb = read_imagef(srcV, sampler, (int2)( srcx/2, srcy )).s0 - 0.5f;
     135        const uint srcy = gy*srch/h;
     136        int Y         = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
     137        const int Cr  = 1048576 * read_imageui(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288;
     138        const int Cb  = 1048576 * read_imageui(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288;
    126139
    127         p.s0 = %s;
    128         p.s1 = %s;
    129         p.s2 = %s;
    130         p.s3 = %s;
     140        p.s0 = convert_uchar_sat_rte(%s>>20);
     141        p.s1 = convert_uchar_sat_rte(%s>>20);
     142        p.s2 = convert_uchar_sat_rte(%s>>20);
     143        p.s3 = convert_uchar_sat_rte(%s>>20);
    131144
    132         write_imagef(dst, (int2)( gx*2, gy ), p);
     145        write_imageui(dst, (int2)( gx*2, gy ), p);
    133146
    134147        if (gx*2+1 < w) {
    135148            srcx = (gx*2+1)*srcw/w;
    136             Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;
     149            Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
    137150
    138             p.s0 = %s;
    139             p.s1 = %s;
    140             p.s2 = %s;
    141             p.s3 = %s;
     151            p.s0 = convert_uchar_sat_rte(%s>>20);
     152            p.s1 = convert_uchar_sat_rte(%s>>20);
     153            p.s2 = convert_uchar_sat_rte(%s>>20);
     154            p.s3 = convert_uchar_sat_rte(%s>>20);
    142155
    143             write_imagef(dst, (int2)( gx*2+1, gy ), p);
     156            write_imageui(dst, (int2)( gx*2+1, gy ), p);
    144157        }
    145158    }
    146159}
     
    155168    RGB_args = rgb_mode_to_indexes(rgb_format)
    156169    assert len(RGB_args)==4, "we need 4 RGB components (R,G,B and A or X), not: %s" % RGB_args
    157170    kname = "%s_to_%s" % (yuv_format, indexes_to_rgb_mode(RGB_args))
    158     args = tuple([kname] + [YUV_TO_RGB[c] for c in rgb_format]*4)
     171    #convert rgb_format into list of 4 channel values:
     172    rgb_args = [get_RGB_formulae(x) for x in rgb_format]
     173    args = tuple([kname] + rgb_args*4)
    159174    kstr = """
    160175__kernel void %s(read_only image2d_t srcY, read_only image2d_t srcU, read_only image2d_t srcV,
    161               uint srcw, uint srch, uint w, uint h,
     176              const uint srcw, const uint srch, const uint w, const uint h,
    162177              const sampler_t sampler, write_only image2d_t dst) {
    163     uint gx = get_global_id(0);
    164     uint gy = get_global_id(1);
     178    const uint gx = get_global_id(0);
     179    const uint gy = get_global_id(1);
    165180
    166     uint x = gx*2;
    167     uint y = gy*2;
     181    const uint x = gx*2;
     182    const uint y = gy*2;
    168183    if ((x < w) & (y < h)) {
    169         float4 p;
     184        uint4 p;
    170185
    171         uint srcx = gx*2*srcw/w;
    172         uint srcy = gy*2*srch/h;
    173         float Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;
    174         float Cr = read_imagef(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 0.5f;
    175         float Cb = read_imagef(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 0.5f;
     186        uint srcx = x*srcw/w;
     187        uint srcy = y*srch/h;
    176188
    177         p.s0 = %s;
    178         p.s1 = %s;
    179         p.s2 = %s;
    180         p.s3 = %s;
     189        //Y = 1.1643 * p.s0 - 0.0625
     190        //Y*2**20  = 1220857 * v - 65536
     191        //Cb*2**20 = 2**20 * v - 2**19;
     192        //Cr*2**20 = 2**20 * v - 2**19;
     193        int Y         = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
     194        const int Cr  = 1048576 * read_imageui(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288;
     195        const int Cb  = 1048576 * read_imageui(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288;
    181196
    182         write_imagef(dst, (int2)( x, y ), p);
     197        p.s0 = convert_uchar_sat_rte(%s>>20);
     198        p.s1 = convert_uchar_sat_rte(%s>>20);
     199        p.s2 = convert_uchar_sat_rte(%s>>20);
     200        p.s3 = convert_uchar_sat_rte(%s>>20);
    183201
     202        write_imageui(dst, (int2)( x, y ), p);
     203
    184204        if (x+1 < w) {
    185             srcx = (gx*2+1)*srcw/w;
    186             Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;
     205            srcx = (x+1)*srcw/w;
     206            Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
    187207
    188             p.s0 = %s;
    189             p.s1 = %s;
    190             p.s2 = %s;
    191             p.s3 = %s;
     208            p.s0 = convert_uchar_sat_rte(%s>>20);
     209            p.s1 = convert_uchar_sat_rte(%s>>20);
     210            p.s2 = convert_uchar_sat_rte(%s>>20);
     211            p.s3 = convert_uchar_sat_rte(%s>>20);
    192212
    193             write_imagef(dst, (int2)( x+1, y ), p);
     213            write_imageui(dst, (int2)( x+1, y ), p);
    194214        }
    195215
    196216        if (y+1 < h) {
    197             srcx = gx*2*srcw/w;
    198             srcy = (gy*2+1)*srch/h;
    199             Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;
     217            srcx = x*srcw/w;
     218            srcy = (y+1)*srch/h;
     219            Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
    200220
    201             p.s0 = %s;
    202             p.s1 = %s;
    203             p.s2 = %s;
    204             p.s3 = %s;
     221            p.s0 = convert_uchar_sat_rte(%s>>20);
     222            p.s1 = convert_uchar_sat_rte(%s>>20);
     223            p.s2 = convert_uchar_sat_rte(%s>>20);
     224            p.s3 = convert_uchar_sat_rte(%s>>20);
    205225
    206             write_imagef(dst, (int2)( x, y+1 ), p);
     226            write_imageui(dst, (int2)( x, y+1 ), p);
    207227
    208228            if (x+1 < w) {
    209                 srcx = (gx*2+1)*srcw/w;
    210                 Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;
     229                srcx = (x+1)*srcw/w;
     230                Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536;
    211231
    212                 p.s0 = %s;
    213                 p.s1 = %s;
    214                 p.s2 = %s;
    215                 p.s3 = %s;
     232                p.s0 = convert_uchar_sat_rte(%s>>20);
     233                p.s1 = convert_uchar_sat_rte(%s>>20);
     234                p.s2 = convert_uchar_sat_rte(%s>>20);
     235                p.s3 = convert_uchar_sat_rte(%s>>20);
    216236
    217                 write_imagef(dst, (int2)( x+1, y+1 ), p);
     237                write_imageui(dst, (int2)( x+1, y+1 ), p);
    218238            }
    219239        }
    220240    }
     
    236256    return YUV_to_RGB_KERNELS
    237257
    238258
     259RGB_TO_YUV = {"Y"   : [0.257,  "*", "R", "+", 0.504, "*", "G", "+", 0.098, "*", "B", "+", 16],
     260              "U"   : [-0.148, "*", "R", "-", 0.291, "*", "G", "+", 0.439, "*", "B", "+", 128],
     261              "V"   : [0.439,  "*", "R", "-", 0.368, "*", "G", "-", 0.071, "*", "B", "+", 128],
     262              }
    239263
     264def get_YUV_formulae(yuv_channel, fmult=2**20, imult=2**20):
     265    #given an YUV channel (Y, U or V), return the formulae for it
     266    #which uses the named variables R, G and B
     267    f = RGB_TO_YUV[yuv_channel]     #ie: ["Y", "+", 1.5958, "*", "Cb"]
     268    mf = []
     269    for x in f:
     270        if type(x)==float:
     271            x = int(round(x*fmult))         #-0.148 -> -155189
     272        if type(x) in (float, int):
     273            x = int(round(x*imult))         #16 -> 16777216
     274        mf.append(x)
     275    return " ".join([str(x) for x in mf])
    240276
     277def get_YUV(yuv_channel, R, G, B, exp=0):
     278    p = 20
     279    f = get_YUV_formulae(yuv_channel, fmult=2**p, imult=2**(p+exp))
     280    #substitute R, G and B:
     281    f = f.replace("R", R).replace("G", G).replace("B", B)
     282    return "(%s)>>%s" % (f, p+exp)
     283
    241284def gen_rgb_to_yuv444p_kernel(rgb_mode):
    242285    RGB_args = rgb_indexes(rgb_mode)
    243     #kernel args: R, G, B are used 3 times each:
     286    R = RGB_args[0]                     #ie: 0
     287    G = RGB_args[1]                     #ie: 1
     288    B = RGB_args[2]                     #ie: 2
    244289    kname = "%s_to_YUV444P" % indexes_to_rgb_mode(RGB_args)
    245     args = tuple([kname]+RGB_args*3)
     290    #kernel args:
     291    args = [kname]
     292    #consts:
     293    pR = "p.s%s" % R
     294    pG = "p.s%s" % G
     295    pB = "p.s%s" % B
     296    #one U pixel with the sum:
     297    Y = get_YUV("Y", pR, pG, pB)
     298    U = get_YUV("U", pR, pG, pB)
     299    V = get_YUV("V", pR, pG, pB)
     300    args += [Y, U, V]
    246301
    247302    kstr = """
    248303__kernel void %s(read_only image2d_t src,
    249               uint srcw, uint srch, uint w, uint h,
     304              const uint srcw, const uint srch, const uint w, const uint h,
    250305              const sampler_t sampler,
    251               global uchar *dstY, uint strideY,
    252               global uchar *dstU, uint strideU,
    253               global uchar *dstV, uint strideV) {
    254     uint gx = get_global_id(0);
    255     uint gy = get_global_id(1);
     306              global uchar *dstY, const uint strideY,
     307              global uchar *dstU, const uint strideU,
     308              global uchar *dstV, const uint strideV) {
     309    const uint gx = get_global_id(0);
     310    const uint gy = get_global_id(1);
    256311
    257312    if ((gx < w) & (gy < h)) {
    258         uint4 p = read_imageui(src, sampler, (int2)( (gx*srcw)/w, (gy*srch)/h ));
     313        const uint4 p = read_imageui(src, sampler, (int2)( (gx*srcw)/w, (gy*srch)/h ));
    259314
    260         float Y =  (0.257 * p.s%s + 0.504 * p.s%s + 0.098 * p.s%s + 16);
    261         float U = (-0.148 * p.s%s - 0.291 * p.s%s + 0.439 * p.s%s + 128);
    262         float V =  (0.439 * p.s%s - 0.368 * p.s%s - 0.071 * p.s%s + 128);
    263 
    264         dstY[gx + gy*strideY] = convert_uchar_rte(Y);
    265         dstU[gx + gy*strideU] = convert_uchar_rte(U);
    266         dstV[gx + gy*strideV] = convert_uchar_rte(V);
     315        dstY[gx + gy*strideY] = convert_uchar_sat_rte(%s);
     316        dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
     317        dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
    267318    }
    268319}
    269320"""
    270     return kname, kstr % args
     321    return kname, kstr % tuple(args)
    271322
    272323def gen_rgb_to_yuv422p_kernel(rgb_mode):
    273324    RGB_args = rgb_indexes(rgb_mode)
    274     #kernel args: R, G, B are used 6 times each:
     325    R = RGB_args[0]                     #ie: 0
     326    G = RGB_args[1]                     #ie: 1
     327    B = RGB_args[2]                     #ie: 2
    275328    kname = "%s_to_YUV422P" % indexes_to_rgb_mode(RGB_args)
    276     args = tuple([kname]+RGB_args*6)
     329    #kernel args:
     330    args = [kname]
     331    #2 Y pixels:
     332    for i in range(2):
     333        Y = get_YUV("Y", "p[%s].s%s" % (i, R), "p[%s].s%s" % (i, G), "p[%s].s%s" % (i, B))
     334        args.append(Y)
     335    #consts:
     336    RR = "+".join(["p[%s].s%s" % (i, R) for i in range(2)])
     337    GG = "+".join(["p[%s].s%s" % (i, G) for i in range(2)])
     338    BB = "+".join(["p[%s].s%s" % (i, B) for i in range(2)])
     339    #one U pixel with the sum:
     340    U = get_YUV("U", "R", "G", "B", exp=1)
     341    V = get_YUV("V", "R", "G", "B", exp=1)
     342    args += [RR, GG, BB, U, V]
    277343
    278344    kstr = """
    279345__kernel void %s(read_only image2d_t src,
     
    282348              global uchar *dstY, uint strideY,
    283349              global uchar *dstU, uint strideU,
    284350              global uchar *dstV, uint strideV) {
    285     uint gx = get_global_id(0);
    286     uint gy = get_global_id(1);
     351    const uint gx = get_global_id(0);
     352    const uint gy = get_global_id(1);
    287353
    288354    if ((gx*2 < w) & (gy < h)) {
    289355        uint srcx = gx*2*srcw/w;
    290         uint srcy = gy*srch/h;
    291         uint4 p1 = read_imageui(src, sampler, (int2)( srcx, srcy ));
    292         uint4 p2 = p1;
     356        const uint srcy = gy*srch/h;
     357        uint4 p[2];
     358        p[0] = read_imageui(src, sampler, (int2)( srcx, srcy ));
     359        p[1] = p[0];
    293360
    294361        //write up to 2 Y pixels:
    295         float Y1 =  (0.257 * p1.s%s + 0.504 * p1.s%s + 0.098 * p1.s%s + 16);
    296         uint i = gx*2 + gy*strideY;
    297         dstY[i] = convert_uchar_rte(Y1);
     362        const uint i = gx*2 + gy*strideY;
     363        dstY[i] = convert_uchar_sat_rte(%s);
    298364        //we process two pixels at a time
    299365        //if the source width is odd, this destination pixel may not exist (right edge of picture)
    300366        //(we only read it via CLAMP_TO_EDGE to calculate U and V, which do exist)
    301367        if (gx*2+1 < w) {
    302368            srcx = (gx*2+1)*srcw/w;
    303             p2 = read_imageui(src, sampler, (int2)( srcx, srcy ));
    304             float Y2 =  (0.257 * p2.s%s + 0.504 * p2.s%s + 0.098 * p2.s%s + 16);
    305             dstY[i+1] = convert_uchar_rte(Y2);
     369            p[1] = read_imageui(src, sampler, (int2)( srcx, srcy ));
     370            dstY[i+1] = convert_uchar_sat_rte(%s);
    306371        }
    307372
     373        const int R = %s;
     374        const int G = %s;
     375        const int B = %s;
    308376        //write 1 U pixel:
    309         float U1 = (-0.148 * p1.s%s - 0.291 * p1.s%s + 0.439 * p1.s%s + 128);
    310         float U2 = (-0.148 * p2.s%s - 0.291 * p2.s%s + 0.439 * p2.s%s + 128);
    311         //some algorithms just ignore U2, we do not and use an average
    312         //dstU[gx + gy*strideU] = convert_uchar_rte(U1);
    313         dstU[gx + gy*strideU] = convert_uchar_rte((U1+U2)/2.0);
    314 
     377        dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
    315378        //write 1 V pixel:
    316         float V1 =  (0.439 * p1.s%s - 0.368 * p1.s%s - 0.071 * p1.s%s + 128);
    317         float V2 =  (0.439 * p2.s%s - 0.368 * p2.s%s - 0.071 * p2.s%s + 128);
    318         //some algorithms just ignore V2, we do not and use an average
    319         //dstV[gx + gy*strideV] = convert_uchar_rte(V1);
    320         dstV[gx + gy*strideV] = convert_uchar_rte((V1+V2)/2.0);
     379        dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
    321380    }
    322381}
    323382"""
    324     return kname, kstr % args
     383    return kname, kstr % tuple(args)
    325384
    326 
    327385def gen_rgb_to_yuv420p_kernel(rgb_mode):
    328     RGB_args = rgb_indexes(rgb_mode)
    329     #kernel args: R, G, B are used 12 times each:
    330     kname = "%s_to_YUV420P" % indexes_to_rgb_mode(RGB_args)
    331     args = tuple([kname]+RGB_args*12)
    332 
     386    RGB_args = rgb_indexes(rgb_mode)    #BGRX -> [2, 1, 0]
     387    R = RGB_args[0]                     #ie: 0
     388    G = RGB_args[1]                     #ie: 1
     389    B = RGB_args[2]                     #ie: 2
     390    kname = "%s_to_YUV420P" % indexes_to_rgb_mode(RGB_args)     # [0, 1, 2] -> RGB
     391    #kernel args:
     392    args = [kname]
     393    #4 Y pixels:
     394    for i in range(4):
     395        Y = get_YUV("Y", "p[%s].s%s" % (i, R), "p[%s].s%s" % (i, G), "p[%s].s%s" % (i, B))
     396        #ie: (roundint(0.257*2**20) * p[i].s2 + roundint(0.504*2**20) * p[i].s1 + roundint(0.098*2**20) * p[i].s0 + 16*2*20)>>20
     397        args.append(Y)
     398    #consts:
     399    RRRR = "+".join(["p[%s].s%s" % (i, R) for i in range(4)])
     400    GGGG = "+".join(["p[%s].s%s" % (i, G) for i in range(4)])
     401    BBBB = "+".join(["p[%s].s%s" % (i, B) for i in range(4)])
     402    #one U pixel with the sum:
     403    U = get_YUV("U", "R", "G", "B", exp=2)
     404    V = get_YUV("V", "R", "G", "B", exp=2)
     405    args += [RRRR, GGGG, BBBB, U, V]
    333406    kstr = """
    334407__kernel void %s(read_only image2d_t src,
    335               uint srcw, uint srch, uint w, uint h,
     408              const uint srcw, const uint srch, const uint w, const uint h,
    336409              const sampler_t sampler,
    337               global uchar *dstY, uint strideY,
    338               global uchar *dstU, uint strideU,
    339               global uchar *dstV, uint strideV) {
    340     uint gx = get_global_id(0);
    341     uint gy = get_global_id(1);
     410              global uchar *dstY, const uint strideY,
     411              global uchar *dstU, const uint strideU,
     412              global uchar *dstV, const uint strideV) {
     413    const uint gx = get_global_id(0);
     414    const uint gy = get_global_id(1);
    342415
    343416    if ((gx*2 < w) & (gy*2 < h)) {
    344417        uint srcx = gx*2*srcw/w;
    345418        uint srcy = gy*2*srch/h;
    346         uint4 p1 = read_imageui(src, sampler, (int2)( srcx, srcy ));
    347         uint4 p2 = p1;
    348         uint4 p3 = p1;
    349         uint4 p4 = p1;
     419        uint4 p[4];
     420        p[0] = read_imageui(src, sampler, (int2)( srcx, srcy ));
     421        p[1] = p[0];
     422        p[2] = p[0];
     423        p[3] = p[0];
    350424
    351         //write up to 4 Y pixels:
    352         float Y1 =  (0.257 * p1.s%s + 0.504 * p1.s%s + 0.098 * p1.s%s + 16);
    353         //same logic as 422P for missing pixels:
    354425        uint i = gx*2 + gy*2*strideY;
    355         dstY[i] = convert_uchar_rte(Y1);
     426        dstY[i] = convert_uchar_sat_rte(%s);
    356427        if (gx*2+1 < w) {
    357428            srcx = (gx*2+1)*srcw/w;
    358             p2 = read_imageui(src, sampler, (int2)( srcx, srcy ));
    359             float Y2 =  (0.257 * p2.s%s + 0.504 * p2.s%s + 0.098 * p2.s%s + 16);
    360             dstY[i+1] = convert_uchar_rte(Y2);
     429            p[1] = read_imageui(src, sampler, (int2)( srcx, srcy ));
     430            dstY[i+1] = convert_uchar_sat_rte(%s);
    361431        }
    362432        if (gy*2+1 < h) {
    363433            i += strideY;
    364434            srcx = gx*2*srcw/w;
    365435            srcy = (gy*2+1)*srch/h;
    366             p3 = read_imageui(src, sampler, (int2)( srcx, srcy ));
    367             float Y3 =  (0.257 * p3.s%s + 0.504 * p3.s%s + 0.098 * p3.s%s + 16);
    368             dstY[i] = convert_uchar_rte(Y3);
     436            p[2] = read_imageui(src, sampler, (int2)( srcx, srcy ));
     437            dstY[i] = convert_uchar_sat_rte(%s);
    369438            if (gx*2+1 < w) {
    370439                srcx = (gx*2+1)*srcw/w;
    371                 p4 = read_imageui(src, sampler, (int2)( srcx, srcy ));
    372                 float Y4 =  (0.257 * p4.s%s + 0.504 * p4.s%s + 0.098 * p4.s%s + 16);
    373                 dstY[i+1] = convert_uchar_rte(Y4);
     440                p[3] = read_imageui(src, sampler, (int2)( srcx, srcy ));
     441                dstY[i+1] = convert_uchar_sat_rte(%s);
    374442            }
    375443        }
    376444
     445        const int R = %s;
     446        const int G = %s;
     447        const int B = %s;
    377448        //write 1 U pixel:
    378         float U1 = (-0.148 * p1.s%s - 0.291 * p1.s%s + 0.439 * p1.s%s + 128);
    379         float U2 = (-0.148 * p2.s%s - 0.291 * p2.s%s + 0.439 * p2.s%s + 128);
    380         float U3 = (-0.148 * p3.s%s - 0.291 * p3.s%s + 0.439 * p3.s%s + 128);
    381         float U4 = (-0.148 * p4.s%s - 0.291 * p4.s%s + 0.439 * p4.s%s + 128);
    382         dstU[gx + gy*strideU] = convert_uchar_rte((U1+U2+U3+U4)/4.0);
    383 
     449        dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
    384450        //write 1 V pixel:
    385         float V1 =  (0.439 * p1.s%s - 0.368 * p1.s%s - 0.071 * p1.s%s + 128);
    386         float V2 =  (0.439 * p2.s%s - 0.368 * p2.s%s - 0.071 * p2.s%s + 128);
    387         float V3 =  (0.439 * p3.s%s - 0.368 * p3.s%s - 0.071 * p3.s%s + 128);
    388         float V4 =  (0.439 * p4.s%s - 0.368 * p4.s%s - 0.071 * p4.s%s + 128);
    389         dstV[gx + gy*strideV] = convert_uchar_rte((V1+V2+V3+V4)/4.0);
     451        dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
    390452    }
    391453}
    392454"""
    393     return kname, kstr % args
     455    return kname, kstr % tuple(args)
    394456
    395457
    396458RGB_to_YUV_generators = {