Ticket #437: csc_opencl-nofloat-v2.patch
File csc_opencl-nofloat-v2.patch, 23.5 KB (added by , 8 years ago) |
---|
-
xpra/codecs/csc_opencl/opencl_kernels.py
1 # Copyright (C) 2011 Michael Zucchi 2 # This file is based on code from socles, an OpenCL image processing library. 1 # This file is part of Xpra. 2 # Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk> 3 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any 4 # later version. See the file COPYING for details. 5 6 # This file is vaguely inspired by code from socles, an OpenCL image processing library. 3 7 # 4 # socles is free software: you can redistribute it and/or modify 5 # it under the terms of the GNU General Public License as published by 6 # the Free Software Foundation, either version 3 of the License, or 7 # (at your option) any later version. 8 # 9 # socles is distributed in the hope that it will be useful, 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # GNU General Public License for more details. 13 # 14 # You should have received a copy of the GNU General Public License 15 # along with socles. If not, see <http://www.gnu.org/licenses/>. 8 # Notes: 9 # * we use integer arithmetic by pre-multiplying coefficients by 2*20 10 # and use a fast bit shift to get the result as an 8-bit unsigned char. 11 # * each Y/U/V channel is passed in as a single channel image2d 12 # * we allow downscaling 13 # * the image sampler is passed in (responsibility of the caller to choose the right one) 14 # * we deal with odd sized images gracefully by clamping the output (via runtime checks) 15 # as well as the input (if sampler_t uses CLAMP_TO_EDGE) 16 16 17 17 18 YUV_TO_RGB = {"X" : "1.0",19 "A" : "1.0",20 "R" : "Y + 1.5958 * Cb",21 "G" : "Y - 0.39173*Cr-0.81290*Cb",22 "B" : "Y + 2.017*Cr"18 YUV_TO_RGB = {"X" : [1.0], 19 "A" : [1.0], 20 "R" : [1.0, "*", "Y", "+", 1.5958, "*", "Cb"], 21 "G" : [1.0, "*", "Y", "-", 0.39173, "*", "Cr", "-", 0.8129, "*", "Cb"], 22 "B" : [1.0, "*", "Y", "+", 2.017, "*", "Cr"], 23 23 } 24 24 25 #Cr width div, Cr heigth div, Cb width div, Cb width div 25 def get_RGB_formulae(rgb_channel, multiplier=2**20): 26 #given an RGB channel (R, G or B), return the formulae for it 27 #which uses the named variables Y, Cb (aka U) and Cr (aka V) 28 f = YUV_TO_RGB[rgb_channel] #ie: ["Y", "+", 1.5958, "*", "Cb"] 29 mf = [] 30 for x in f: 31 if type(x)==float: 32 x = int(round(x*multiplier)) #1.5958 -> 1673318 33 mf.append(x) 34 return " ".join([str(x) for x in mf]) 35 36 26 37 YUV_FORMATS = ("YUV444P", "YUV422P", "YUV420P") 27 38 28 39 def indexes_to_rgb_mode(RGB_args): … … 70 81 RGB_args = rgb_mode_to_indexes(rgb_format) 71 82 assert len(RGB_args)==4, "we need 4 RGB components (R,G,B and A or X), not: %s" % RGB_args 72 83 kname = "%s_to_%s" % (yuv_format, indexes_to_rgb_mode(RGB_args)) 73 args = tuple([kname] + [YUV_TO_RGB[c] for c in rgb_format]) 84 rgb_args = [get_RGB_formulae(x) for x in rgb_format] 85 args = tuple([kname] + rgb_args) 74 86 kstr = """ 75 87 __kernel void %s(read_only image2d_t srcY, read_only image2d_t srcU, read_only image2d_t srcV, 76 uint srcw, uint srch, uint w,uint h,88 const uint srcw, const uint srch, const uint w, const uint h, 77 89 const sampler_t sampler, write_only image2d_t dst) { 78 uint gx = get_global_id(0);79 uint gy = get_global_id(1);80 uint srcx = gx*srcw/w;81 uint srcy = gy*srch/h;90 const uint gx = get_global_id(0); 91 const uint gy = get_global_id(1); 92 const uint srcx = gx*srcw/w; 93 const uint srcy = gy*srch/h; 82 94 83 95 if ((gx < w) & (gy < h)) { 84 float4 p;96 uint4 p; 85 97 86 int2 src= (int2)( srcx, srcy );87 float Y = 1.1643 * read_imagef(srcY, sampler, src).s0 - 0.0625;88 float Cr = read_imagef(srcU, sampler, src).s0 - 0.5f;89 float Cb = read_imagef(srcV, sampler, src).s0 - 0.5f;98 const int2 src = (int2)( srcx, srcy ); 99 const int Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 100 const int Cr = 1048576 * read_imageui(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288; 101 const int Cb = 1048576 * read_imageui(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288; 90 102 91 p.s0 = %s;92 p.s1 = %s;93 p.s2 = %s;94 p.s3 = %s;103 p.s0 = convert_uchar_sat_rte(%s>>20); 104 p.s1 = convert_uchar_sat_rte(%s>>20); 105 p.s2 = convert_uchar_sat_rte(%s>>20); 106 p.s3 = convert_uchar_sat_rte(%s>>20); 95 107 96 write_image f(dst, (int2)( gx, gy ), p);108 write_imageui(dst, (int2)( gx, gy ), p); 97 109 } 98 110 } 99 111 """ … … 107 119 RGB_args = rgb_mode_to_indexes(rgb_format) 108 120 assert len(RGB_args)==4, "we need 4 RGB components (R,G,B and A or X), not: %s" % RGB_args 109 121 kname = "%s_to_%s" % (yuv_format, indexes_to_rgb_mode(RGB_args)) 110 args = tuple([kname] + [YUV_TO_RGB[c] for c in rgb_format]*2) 122 rgb_args = [get_RGB_formulae(x) for x in rgb_format] 123 args = tuple([kname] + rgb_args*2) 111 124 kstr = """ 112 125 __kernel void %s(read_only image2d_t srcY, read_only image2d_t srcU, read_only image2d_t srcV, 113 uint srcw, uint srch, uint w,uint h,126 const uint srcw, const uint srch, const uint w, const uint h, 114 127 const sampler_t sampler, write_only image2d_t dst) { 115 uint gx = get_global_id(0);116 uint gy = get_global_id(1);128 const uint gx = get_global_id(0); 129 const uint gy = get_global_id(1); 117 130 118 131 if ((gx*2 < w) & (gy < h)) { 119 float4 p;132 uint4 p; 120 133 121 134 uint srcx = gx*2*srcw/w; 122 uint srcy = gy*srch/h;123 float Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;124 float Cr = read_imagef(srcU, sampler, (int2)( srcx/2, srcy )).s0 - 0.5f;125 float Cb = read_imagef(srcV, sampler, (int2)( srcx/2, srcy )).s0 - 0.5f;135 const uint srcy = gy*srch/h; 136 int Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 137 const int Cr = 1048576 * read_imageui(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288; 138 const int Cb = 1048576 * read_imageui(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288; 126 139 127 p.s0 = %s;128 p.s1 = %s;129 p.s2 = %s;130 p.s3 = %s;140 p.s0 = convert_uchar_sat_rte(%s>>20); 141 p.s1 = convert_uchar_sat_rte(%s>>20); 142 p.s2 = convert_uchar_sat_rte(%s>>20); 143 p.s3 = convert_uchar_sat_rte(%s>>20); 131 144 132 write_image f(dst, (int2)( gx*2, gy ), p);145 write_imageui(dst, (int2)( gx*2, gy ), p); 133 146 134 147 if (gx*2+1 < w) { 135 148 srcx = (gx*2+1)*srcw/w; 136 Y = 1 .1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;149 Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 137 150 138 p.s0 = %s;139 p.s1 = %s;140 p.s2 = %s;141 p.s3 = %s;151 p.s0 = convert_uchar_sat_rte(%s>>20); 152 p.s1 = convert_uchar_sat_rte(%s>>20); 153 p.s2 = convert_uchar_sat_rte(%s>>20); 154 p.s3 = convert_uchar_sat_rte(%s>>20); 142 155 143 write_image f(dst, (int2)( gx*2+1, gy ), p);156 write_imageui(dst, (int2)( gx*2+1, gy ), p); 144 157 } 145 158 } 146 159 } … … 155 168 RGB_args = rgb_mode_to_indexes(rgb_format) 156 169 assert len(RGB_args)==4, "we need 4 RGB components (R,G,B and A or X), not: %s" % RGB_args 157 170 kname = "%s_to_%s" % (yuv_format, indexes_to_rgb_mode(RGB_args)) 158 args = tuple([kname] + [YUV_TO_RGB[c] for c in rgb_format]*4) 171 #convert rgb_format into list of 4 channel values: 172 rgb_args = [get_RGB_formulae(x) for x in rgb_format] 173 args = tuple([kname] + rgb_args*4) 159 174 kstr = """ 160 175 __kernel void %s(read_only image2d_t srcY, read_only image2d_t srcU, read_only image2d_t srcV, 161 uint srcw, uint srch, uint w,uint h,176 const uint srcw, const uint srch, const uint w, const uint h, 162 177 const sampler_t sampler, write_only image2d_t dst) { 163 uint gx = get_global_id(0);164 uint gy = get_global_id(1);178 const uint gx = get_global_id(0); 179 const uint gy = get_global_id(1); 165 180 166 uint x = gx*2;167 uint y = gy*2;181 const uint x = gx*2; 182 const uint y = gy*2; 168 183 if ((x < w) & (y < h)) { 169 float4 p;184 uint4 p; 170 185 171 uint srcx = gx*2*srcw/w; 172 uint srcy = gy*2*srch/h; 173 float Y = 1.1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625; 174 float Cr = read_imagef(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 0.5f; 175 float Cb = read_imagef(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 0.5f; 186 uint srcx = x*srcw/w; 187 uint srcy = y*srch/h; 176 188 177 p.s0 = %s; 178 p.s1 = %s; 179 p.s2 = %s; 180 p.s3 = %s; 189 //Y = 1.1643 * p.s0 - 0.0625 190 //Y*2**20 = 1220857 * v - 65536 191 //Cb*2**20 = 2**20 * v - 2**19; 192 //Cr*2**20 = 2**20 * v - 2**19; 193 int Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 194 const int Cr = 1048576 * read_imageui(srcU, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288; 195 const int Cb = 1048576 * read_imageui(srcV, sampler, (int2)( srcx/2, srcy/2 )).s0 - 524288; 181 196 182 write_imagef(dst, (int2)( x, y ), p); 197 p.s0 = convert_uchar_sat_rte(%s>>20); 198 p.s1 = convert_uchar_sat_rte(%s>>20); 199 p.s2 = convert_uchar_sat_rte(%s>>20); 200 p.s3 = convert_uchar_sat_rte(%s>>20); 183 201 202 write_imageui(dst, (int2)( x, y ), p); 203 184 204 if (x+1 < w) { 185 srcx = ( gx*2+1)*srcw/w;186 Y = 1 .1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;205 srcx = (x+1)*srcw/w; 206 Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 187 207 188 p.s0 = %s;189 p.s1 = %s;190 p.s2 = %s;191 p.s3 = %s;208 p.s0 = convert_uchar_sat_rte(%s>>20); 209 p.s1 = convert_uchar_sat_rte(%s>>20); 210 p.s2 = convert_uchar_sat_rte(%s>>20); 211 p.s3 = convert_uchar_sat_rte(%s>>20); 192 212 193 write_image f(dst, (int2)( x+1, y ), p);213 write_imageui(dst, (int2)( x+1, y ), p); 194 214 } 195 215 196 216 if (y+1 < h) { 197 srcx = gx*2*srcw/w;198 srcy = ( gy*2+1)*srch/h;199 Y = 1 .1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;217 srcx = x*srcw/w; 218 srcy = (y+1)*srch/h; 219 Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 200 220 201 p.s0 = %s;202 p.s1 = %s;203 p.s2 = %s;204 p.s3 = %s;221 p.s0 = convert_uchar_sat_rte(%s>>20); 222 p.s1 = convert_uchar_sat_rte(%s>>20); 223 p.s2 = convert_uchar_sat_rte(%s>>20); 224 p.s3 = convert_uchar_sat_rte(%s>>20); 205 225 206 write_image f(dst, (int2)( x, y+1 ), p);226 write_imageui(dst, (int2)( x, y+1 ), p); 207 227 208 228 if (x+1 < w) { 209 srcx = ( gx*2+1)*srcw/w;210 Y = 1 .1643 * read_imagef(srcY, sampler, (int2)( srcx, srcy )).s0 - 0.0625;229 srcx = (x+1)*srcw/w; 230 Y = 1220857 * read_imageui(srcY, sampler, (int2)( srcx, srcy )).s0 - 65536; 211 231 212 p.s0 = %s;213 p.s1 = %s;214 p.s2 = %s;215 p.s3 = %s;232 p.s0 = convert_uchar_sat_rte(%s>>20); 233 p.s1 = convert_uchar_sat_rte(%s>>20); 234 p.s2 = convert_uchar_sat_rte(%s>>20); 235 p.s3 = convert_uchar_sat_rte(%s>>20); 216 236 217 write_image f(dst, (int2)( x+1, y+1 ), p);237 write_imageui(dst, (int2)( x+1, y+1 ), p); 218 238 } 219 239 } 220 240 } … … 236 256 return YUV_to_RGB_KERNELS 237 257 238 258 259 RGB_TO_YUV = {"Y" : [0.257, "*", "R", "+", 0.504, "*", "G", "+", 0.098, "*", "B", "+", 16], 260 "U" : [-0.148, "*", "R", "-", 0.291, "*", "G", "+", 0.439, "*", "B", "+", 128], 261 "V" : [0.439, "*", "R", "-", 0.368, "*", "G", "-", 0.071, "*", "B", "+", 128], 262 } 239 263 264 def get_YUV_formulae(yuv_channel, fmult=2**20, imult=2**20): 265 #given an YUV channel (Y, U or V), return the formulae for it 266 #which uses the named variables R, G and B 267 f = RGB_TO_YUV[yuv_channel] #ie: ["Y", "+", 1.5958, "*", "Cb"] 268 mf = [] 269 for x in f: 270 if type(x)==float: 271 x = int(round(x*fmult)) #-0.148 -> -155189 272 if type(x) in (float, int): 273 x = int(round(x*imult)) #16 -> 16777216 274 mf.append(x) 275 return " ".join([str(x) for x in mf]) 240 276 277 def get_YUV(yuv_channel, R, G, B, exp=0): 278 p = 20 279 f = get_YUV_formulae(yuv_channel, fmult=2**p, imult=2**(p+exp)) 280 #substitute R, G and B: 281 f = f.replace("R", R).replace("G", G).replace("B", B) 282 return "(%s)>>%s" % (f, p+exp) 283 241 284 def gen_rgb_to_yuv444p_kernel(rgb_mode): 242 285 RGB_args = rgb_indexes(rgb_mode) 243 #kernel args: R, G, B are used 3 times each: 286 R = RGB_args[0] #ie: 0 287 G = RGB_args[1] #ie: 1 288 B = RGB_args[2] #ie: 2 244 289 kname = "%s_to_YUV444P" % indexes_to_rgb_mode(RGB_args) 245 args = tuple([kname]+RGB_args*3) 290 #kernel args: 291 args = [kname] 292 #consts: 293 pR = "p.s%s" % R 294 pG = "p.s%s" % G 295 pB = "p.s%s" % B 296 #one U pixel with the sum: 297 Y = get_YUV("Y", pR, pG, pB) 298 U = get_YUV("U", pR, pG, pB) 299 V = get_YUV("V", pR, pG, pB) 300 args += [Y, U, V] 246 301 247 302 kstr = """ 248 303 __kernel void %s(read_only image2d_t src, 249 uint srcw, uint srch, uint w,uint h,304 const uint srcw, const uint srch, const uint w, const uint h, 250 305 const sampler_t sampler, 251 global uchar *dstY, uint strideY,252 global uchar *dstU, uint strideU,253 global uchar *dstV, uint strideV) {254 uint gx = get_global_id(0);255 uint gy = get_global_id(1);306 global uchar *dstY, const uint strideY, 307 global uchar *dstU, const uint strideU, 308 global uchar *dstV, const uint strideV) { 309 const uint gx = get_global_id(0); 310 const uint gy = get_global_id(1); 256 311 257 312 if ((gx < w) & (gy < h)) { 258 uint4 p = read_imageui(src, sampler, (int2)( (gx*srcw)/w, (gy*srch)/h ));313 const uint4 p = read_imageui(src, sampler, (int2)( (gx*srcw)/w, (gy*srch)/h )); 259 314 260 float Y = (0.257 * p.s%s + 0.504 * p.s%s + 0.098 * p.s%s + 16); 261 float U = (-0.148 * p.s%s - 0.291 * p.s%s + 0.439 * p.s%s + 128); 262 float V = (0.439 * p.s%s - 0.368 * p.s%s - 0.071 * p.s%s + 128); 263 264 dstY[gx + gy*strideY] = convert_uchar_rte(Y); 265 dstU[gx + gy*strideU] = convert_uchar_rte(U); 266 dstV[gx + gy*strideV] = convert_uchar_rte(V); 315 dstY[gx + gy*strideY] = convert_uchar_sat_rte(%s); 316 dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s); 317 dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s); 267 318 } 268 319 } 269 320 """ 270 return kname, kstr % args321 return kname, kstr % tuple(args) 271 322 272 323 def gen_rgb_to_yuv422p_kernel(rgb_mode): 273 324 RGB_args = rgb_indexes(rgb_mode) 274 #kernel args: R, G, B are used 6 times each: 325 R = RGB_args[0] #ie: 0 326 G = RGB_args[1] #ie: 1 327 B = RGB_args[2] #ie: 2 275 328 kname = "%s_to_YUV422P" % indexes_to_rgb_mode(RGB_args) 276 args = tuple([kname]+RGB_args*6) 329 #kernel args: 330 args = [kname] 331 #2 Y pixels: 332 for i in range(2): 333 Y = get_YUV("Y", "p[%s].s%s" % (i, R), "p[%s].s%s" % (i, G), "p[%s].s%s" % (i, B)) 334 args.append(Y) 335 #consts: 336 RR = "+".join(["p[%s].s%s" % (i, R) for i in range(2)]) 337 GG = "+".join(["p[%s].s%s" % (i, G) for i in range(2)]) 338 BB = "+".join(["p[%s].s%s" % (i, B) for i in range(2)]) 339 #one U pixel with the sum: 340 U = get_YUV("U", "R", "G", "B", exp=1) 341 V = get_YUV("V", "R", "G", "B", exp=1) 342 args += [RR, GG, BB, U, V] 277 343 278 344 kstr = """ 279 345 __kernel void %s(read_only image2d_t src, … … 282 348 global uchar *dstY, uint strideY, 283 349 global uchar *dstU, uint strideU, 284 350 global uchar *dstV, uint strideV) { 285 uint gx = get_global_id(0);286 uint gy = get_global_id(1);351 const uint gx = get_global_id(0); 352 const uint gy = get_global_id(1); 287 353 288 354 if ((gx*2 < w) & (gy < h)) { 289 355 uint srcx = gx*2*srcw/w; 290 uint srcy = gy*srch/h; 291 uint4 p1 = read_imageui(src, sampler, (int2)( srcx, srcy )); 292 uint4 p2 = p1; 356 const uint srcy = gy*srch/h; 357 uint4 p[2]; 358 p[0] = read_imageui(src, sampler, (int2)( srcx, srcy )); 359 p[1] = p[0]; 293 360 294 361 //write up to 2 Y pixels: 295 float Y1 = (0.257 * p1.s%s + 0.504 * p1.s%s + 0.098 * p1.s%s + 16); 296 uint i = gx*2 + gy*strideY; 297 dstY[i] = convert_uchar_rte(Y1); 362 const uint i = gx*2 + gy*strideY; 363 dstY[i] = convert_uchar_sat_rte(%s); 298 364 //we process two pixels at a time 299 365 //if the source width is odd, this destination pixel may not exist (right edge of picture) 300 366 //(we only read it via CLAMP_TO_EDGE to calculate U and V, which do exist) 301 367 if (gx*2+1 < w) { 302 368 srcx = (gx*2+1)*srcw/w; 303 p2 = read_imageui(src, sampler, (int2)( srcx, srcy )); 304 float Y2 = (0.257 * p2.s%s + 0.504 * p2.s%s + 0.098 * p2.s%s + 16); 305 dstY[i+1] = convert_uchar_rte(Y2); 369 p[1] = read_imageui(src, sampler, (int2)( srcx, srcy )); 370 dstY[i+1] = convert_uchar_sat_rte(%s); 306 371 } 307 372 373 const int R = %s; 374 const int G = %s; 375 const int B = %s; 308 376 //write 1 U pixel: 309 float U1 = (-0.148 * p1.s%s - 0.291 * p1.s%s + 0.439 * p1.s%s + 128); 310 float U2 = (-0.148 * p2.s%s - 0.291 * p2.s%s + 0.439 * p2.s%s + 128); 311 //some algorithms just ignore U2, we do not and use an average 312 //dstU[gx + gy*strideU] = convert_uchar_rte(U1); 313 dstU[gx + gy*strideU] = convert_uchar_rte((U1+U2)/2.0); 314 377 dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s); 315 378 //write 1 V pixel: 316 float V1 = (0.439 * p1.s%s - 0.368 * p1.s%s - 0.071 * p1.s%s + 128); 317 float V2 = (0.439 * p2.s%s - 0.368 * p2.s%s - 0.071 * p2.s%s + 128); 318 //some algorithms just ignore V2, we do not and use an average 319 //dstV[gx + gy*strideV] = convert_uchar_rte(V1); 320 dstV[gx + gy*strideV] = convert_uchar_rte((V1+V2)/2.0); 379 dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s); 321 380 } 322 381 } 323 382 """ 324 return kname, kstr % args383 return kname, kstr % tuple(args) 325 384 326 327 385 def gen_rgb_to_yuv420p_kernel(rgb_mode): 328 RGB_args = rgb_indexes(rgb_mode) 329 #kernel args: R, G, B are used 12 times each: 330 kname = "%s_to_YUV420P" % indexes_to_rgb_mode(RGB_args) 331 args = tuple([kname]+RGB_args*12) 332 386 RGB_args = rgb_indexes(rgb_mode) #BGRX -> [2, 1, 0] 387 R = RGB_args[0] #ie: 0 388 G = RGB_args[1] #ie: 1 389 B = RGB_args[2] #ie: 2 390 kname = "%s_to_YUV420P" % indexes_to_rgb_mode(RGB_args) # [0, 1, 2] -> RGB 391 #kernel args: 392 args = [kname] 393 #4 Y pixels: 394 for i in range(4): 395 Y = get_YUV("Y", "p[%s].s%s" % (i, R), "p[%s].s%s" % (i, G), "p[%s].s%s" % (i, B)) 396 #ie: (roundint(0.257*2**20) * p[i].s2 + roundint(0.504*2**20) * p[i].s1 + roundint(0.098*2**20) * p[i].s0 + 16*2*20)>>20 397 args.append(Y) 398 #consts: 399 RRRR = "+".join(["p[%s].s%s" % (i, R) for i in range(4)]) 400 GGGG = "+".join(["p[%s].s%s" % (i, G) for i in range(4)]) 401 BBBB = "+".join(["p[%s].s%s" % (i, B) for i in range(4)]) 402 #one U pixel with the sum: 403 U = get_YUV("U", "R", "G", "B", exp=2) 404 V = get_YUV("V", "R", "G", "B", exp=2) 405 args += [RRRR, GGGG, BBBB, U, V] 333 406 kstr = """ 334 407 __kernel void %s(read_only image2d_t src, 335 uint srcw, uint srch, uint w,uint h,408 const uint srcw, const uint srch, const uint w, const uint h, 336 409 const sampler_t sampler, 337 global uchar *dstY, uint strideY,338 global uchar *dstU, uint strideU,339 global uchar *dstV, uint strideV) {340 uint gx = get_global_id(0);341 uint gy = get_global_id(1);410 global uchar *dstY, const uint strideY, 411 global uchar *dstU, const uint strideU, 412 global uchar *dstV, const uint strideV) { 413 const uint gx = get_global_id(0); 414 const uint gy = get_global_id(1); 342 415 343 416 if ((gx*2 < w) & (gy*2 < h)) { 344 417 uint srcx = gx*2*srcw/w; 345 418 uint srcy = gy*2*srch/h; 346 uint4 p1 = read_imageui(src, sampler, (int2)( srcx, srcy )); 347 uint4 p2 = p1; 348 uint4 p3 = p1; 349 uint4 p4 = p1; 419 uint4 p[4]; 420 p[0] = read_imageui(src, sampler, (int2)( srcx, srcy )); 421 p[1] = p[0]; 422 p[2] = p[0]; 423 p[3] = p[0]; 350 424 351 //write up to 4 Y pixels:352 float Y1 = (0.257 * p1.s%s + 0.504 * p1.s%s + 0.098 * p1.s%s + 16);353 //same logic as 422P for missing pixels:354 425 uint i = gx*2 + gy*2*strideY; 355 dstY[i] = convert_uchar_ rte(Y1);426 dstY[i] = convert_uchar_sat_rte(%s); 356 427 if (gx*2+1 < w) { 357 428 srcx = (gx*2+1)*srcw/w; 358 p2 = read_imageui(src, sampler, (int2)( srcx, srcy )); 359 float Y2 = (0.257 * p2.s%s + 0.504 * p2.s%s + 0.098 * p2.s%s + 16); 360 dstY[i+1] = convert_uchar_rte(Y2); 429 p[1] = read_imageui(src, sampler, (int2)( srcx, srcy )); 430 dstY[i+1] = convert_uchar_sat_rte(%s); 361 431 } 362 432 if (gy*2+1 < h) { 363 433 i += strideY; 364 434 srcx = gx*2*srcw/w; 365 435 srcy = (gy*2+1)*srch/h; 366 p3 = read_imageui(src, sampler, (int2)( srcx, srcy )); 367 float Y3 = (0.257 * p3.s%s + 0.504 * p3.s%s + 0.098 * p3.s%s + 16); 368 dstY[i] = convert_uchar_rte(Y3); 436 p[2] = read_imageui(src, sampler, (int2)( srcx, srcy )); 437 dstY[i] = convert_uchar_sat_rte(%s); 369 438 if (gx*2+1 < w) { 370 439 srcx = (gx*2+1)*srcw/w; 371 p4 = read_imageui(src, sampler, (int2)( srcx, srcy )); 372 float Y4 = (0.257 * p4.s%s + 0.504 * p4.s%s + 0.098 * p4.s%s + 16); 373 dstY[i+1] = convert_uchar_rte(Y4); 440 p[3] = read_imageui(src, sampler, (int2)( srcx, srcy )); 441 dstY[i+1] = convert_uchar_sat_rte(%s); 374 442 } 375 443 } 376 444 445 const int R = %s; 446 const int G = %s; 447 const int B = %s; 377 448 //write 1 U pixel: 378 float U1 = (-0.148 * p1.s%s - 0.291 * p1.s%s + 0.439 * p1.s%s + 128); 379 float U2 = (-0.148 * p2.s%s - 0.291 * p2.s%s + 0.439 * p2.s%s + 128); 380 float U3 = (-0.148 * p3.s%s - 0.291 * p3.s%s + 0.439 * p3.s%s + 128); 381 float U4 = (-0.148 * p4.s%s - 0.291 * p4.s%s + 0.439 * p4.s%s + 128); 382 dstU[gx + gy*strideU] = convert_uchar_rte((U1+U2+U3+U4)/4.0); 383 449 dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s); 384 450 //write 1 V pixel: 385 float V1 = (0.439 * p1.s%s - 0.368 * p1.s%s - 0.071 * p1.s%s + 128); 386 float V2 = (0.439 * p2.s%s - 0.368 * p2.s%s - 0.071 * p2.s%s + 128); 387 float V3 = (0.439 * p3.s%s - 0.368 * p3.s%s - 0.071 * p3.s%s + 128); 388 float V4 = (0.439 * p4.s%s - 0.368 * p4.s%s - 0.071 * p4.s%s + 128); 389 dstV[gx + gy*strideV] = convert_uchar_rte((V1+V2+V3+V4)/4.0); 451 dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s); 390 452 } 391 453 } 392 454 """ 393 return kname, kstr % args455 return kname, kstr % tuple(args) 394 456 395 457 396 458 RGB_to_YUV_generators = {