Ticket #370: csc-with-stride.patch
File csc-with-stride.patch, 19.1 KB (added by , 8 years ago) |
---|
-
xpra/codecs/codec_constants.py
4 4 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any 5 5 # later version. See the file COPYING for details. 6 6 7 #this table is used to convert a dimension (width x height) 8 #into a list of plane sizes 7 9 PIXEL_SUBSAMPLING = { 10 "NV12" : ((1, 1), (1, 2)), #2 planes: "Y" and "UV" packed together (U and V are subsampled by 2 in both dimensions - but packed the width is "unchanged") 8 11 "YUV420P" : ((1, 1), (2, 2), (2, 2)), 9 12 "YUV422P" : ((1, 1), (2, 1), (2, 1)), 10 13 "YUV444P" : ((1, 1), (1, 1), (1, 1)), … … 48 51 quality=100, speed=100, 49 52 setup_cost=50, cpu_cost=100, gpu_cost=0, 50 53 min_w=1, min_h=1, max_w=4*1024, max_h=4*1024, max_pixels=4*1024*4*1024, 54 width_stride_rounding = 1, height_stride_rounding = 1, 51 55 can_scale=False, 52 56 width_mask=0xFFFF, height_mask=0xFFFF): 53 57 self.codec_class = codec_class #ie: xpra.codecs.enc_x264.encoder.Encoder … … 61 65 self.min_h = min_h 62 66 self.max_w = max_w 63 67 self.max_h = max_h 68 self.width_stride_rounding = width_stride_rounding 69 self.height_stride_rounding = height_stride_rounding 64 70 self.max_pixels = max_pixels 65 71 self.width_mask = width_mask 66 72 self.height_mask = height_mask -
xpra/codecs/csc_nvcuda/colorspace_converter.py
260 260 self.dst_width = 0 261 261 self.dst_height = 0 262 262 self.dst_format = "" 263 self.width_stride_rounding = 0 264 self.height_stride_rounding = 0 263 265 self.time = 0 264 266 self.frames = 0 265 267 self.kernel_function = None 266 268 self.context = None 267 269 268 270 def init_context(self, src_width, src_height, src_format, 269 dst_width, dst_height, dst_format, speed=100): #@DuplicatedSignature 271 dst_width, dst_height, dst_format, 272 width_stride_rounding, height_stride_rounding, 273 speed=100): #@DuplicatedSignature 270 274 validate_in_out(src_format, dst_format) 271 275 init_context() 272 276 self.src_width = src_width … … 275 279 self.dst_width = dst_width 276 280 self.dst_height = dst_height 277 281 self.dst_format = dst_format 282 self.width_stride_rounding = width_stride_rounding 283 self.height_stride_rounding = height_stride_rounding 278 284 self.context = context 279 285 k = (src_format, dst_format) 280 286 npp_fn = COLORSPACES_MAP.get(k) … … 452 458 driver.memcpy_dtoh_async(pixels, out_buf, stream) 453 459 else: 454 460 #we don't want the crazy large GPU padding, so we do it ourselves: 455 stride = width*4461 stride = roundup(width*4, self.width_stride_rounding) 456 462 pixels = driver.pagelocked_empty(stride*height, dtype=numpy.byte) 457 463 copy = driver.Memcpy2D() 458 464 copy.set_src_device(out_buf) … … 508 514 out_sizes = [] 509 515 for i in range(3): 510 516 x_div, y_div = divs[i] 511 out_stride = roundup(width/x_div, 4)512 out_height = roundup(height/y_div, 2)517 out_stride = roundup(width/x_div, self.width_stride_rounding) 518 out_height = roundup(height/y_div, self.height_stride_rounding) 513 519 out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4) 514 520 out_bufs.append(out_buf) 515 521 out_strides.append(out_stride) -
xpra/codecs/csc_opencl/colorspace_converter.py
159 159 #log.info("has_same_channels(%s, %s)=%s (%s - %s)", src, dst, scheck, dcheck, len(scheck)==0 and len(dcheck)==0) 160 160 return len(scheck)==0 and len(dcheck)==0 161 161 162 KERNELS_DEFS = {} 162 163 163 def gen_yuv_to_rgb(): 164 164 global context 165 165 from xpra.codecs.csc_opencl.opencl_kernels import gen_yuv_to_rgb_kernels, rgb_mode_to_indexes, indexes_to_rgb_mode … … 316 316 return RGB_to_YUV_KERNELS 317 317 318 318 319 KERNELS_DEFS = {} 319 320 def gen_kernels(): 320 321 """ 321 322 The code here is complicated by the fact that we don't know … … 412 413 self.dst_width = 0 413 414 self.dst_height = 0 414 415 self.dst_format = "" 416 self.width_stride_rounding = 0 417 self.height_stride_rounding = 0 415 418 self.time = 0 416 419 self.frames = 0 417 420 self.queue = None … … 421 424 self.kernel_function_name = None 422 425 423 426 def init_context(self, src_width, src_height, src_format, 424 dst_width, dst_height, dst_format, csc_speed=100): #@DuplicatedSignature 427 dst_width, dst_height, dst_format, 428 width_stride_rounding, height_stride_rounding, 429 csc_speed=100): #@DuplicatedSignature 425 430 global context 426 431 debug("init_context%s", (src_width, src_height, src_format, dst_width, dst_height, dst_format, csc_speed)) 427 432 validate_in_out(src_format, dst_format) … … 431 436 self.dst_width = dst_width 432 437 self.dst_height = dst_height 433 438 self.dst_format = dst_format 439 self.width_stride_rounding = width_stride_rounding 440 self.height_stride_rounding = height_stride_rounding 434 441 self.queue = pyopencl.CommandQueue(context) 435 442 #sampling type: 436 443 if self.src_width>self.dst_width and self.src_height>self.dst_height: … … 557 564 558 565 #output image: 559 566 oformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNORM_INT8) 560 oimage = pyopencl.Image(context, mem_flags.WRITE_ONLY, oformat, shape=(self.dst_width, self.dst_height)) 567 outstride = roundup(self.dst_width, self.width_stride_rounding) 568 outheight = roundup(self.dst_height, self.height_stride_rounding) 569 oimage = pyopencl.Image(context, mem_flags.WRITE_ONLY, oformat, shape=(outstride, outheight)) 561 570 562 571 iformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8) 563 572 for i in range(3): … … 571 580 iimage = pyopencl.Image(context, flags, iformat, shape=shape, hostbuf=plane) 572 581 kernelargs.append(iimage) 573 582 583 #FIXME: pass out stride! 574 584 kernelargs += [numpy.int32(self.src_width), numpy.int32(self.src_height), 575 585 numpy.int32(self.dst_width), numpy.int32(self.dst_height), 576 586 self.sampler, oimage] … … 582 592 kend = time.time() 583 593 debug("%s took %.1fms", self.kernel_function, 1000.0*(kend-kstart)) 584 594 585 out_array = numpy.empty( self.dst_width*self.dst_height*4, dtype=numpy.byte)595 out_array = numpy.empty(outstride*outheight*4, dtype=numpy.byte) 586 596 pyopencl.enqueue_read_image(self.queue, oimage, origin=(0, 0), region=(self.dst_width, self.dst_height), hostbuf=out_array, is_blocking=True) 587 597 self.queue.finish() 588 598 debug("readback using %s took %.1fms", CHANNEL_ORDER_TO_STR.get(self.channel_order), 1000.0*(time.time()-kend)) … … 607 617 #adjust work dimensions for subsampling: 608 618 #(we process N pixels at a time in each dimension) 609 619 divs = get_subsampling_divs(self.dst_format) 620 assert len(divs) in ImageWrapper.PLANE_OPTIONS, "invalid number of planes: %s" % len(divs) 610 621 wwidth = dimdiv(self.dst_width, max([x_div for x_div, _ in divs])) 611 622 wheight = dimdiv(self.dst_height, max([y_div for _, y_div in divs])) 612 623 globalWorkSize, localWorkSize = self.get_work_sizes(wwidth, wheight) … … 635 646 strides = [] 636 647 out_buffers = [] 637 648 out_sizes = [] 638 for i in range(3): 649 nplanes = len(divs) 650 for i in range(nplanes): 639 651 x_div, y_div = divs[i] 640 p_stride = roundup(self.dst_width / x_div, max(2, localWorkSize[0] ))641 p_height = roundup(self.dst_height / y_div, 2 )652 p_stride = roundup(self.dst_width / x_div, max(2, localWorkSize[0], self.width_stride_rounding)) 653 p_height = roundup(self.dst_height / y_div, 2, self.height_stride_rounding) 642 654 p_size = p_stride * p_height 643 655 #debug("output buffer for channel %s: stride=%s, height=%s, size=%s", i, p_stride, p_height, p_size) 644 656 out_buf = pyopencl.Buffer(context, mem_flags.WRITE_ONLY, p_size) … … 656 668 #read back: 657 669 pixels = [] 658 670 read_events = [] 659 for i in range( 3):671 for i in range(nplanes): 660 672 out_array = numpy.empty(out_sizes[i], dtype=numpy.byte) 661 673 pixels.append(out_array.data) 662 674 read = pyopencl.enqueue_read_buffer(self.queue, out_buffers[i], out_array, is_blocking=False) … … 667 679 self.queue.finish() 668 680 readend = time.time() 669 681 debug("wait for read events took %.1fms", 1000.0*(readend-readstart)) 670 return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes= ImageWrapper._3_PLANES)682 return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=nplanes) -
xpra/codecs/csc_opencl/opencl_kernels.py
393 393 return kname, kstr % args 394 394 395 395 396 def gen_rgb_to_nv12_kernel(rgb_mode): 397 RGB_args = rgb_indexes(rgb_mode) 398 #kernel args: R, G, B are used 12 times each: 399 kname = "%s_to_nv12" % indexes_to_rgb_mode(RGB_args) 400 args = tuple([kname]+RGB_args*12) 401 402 kstr = """ 403 __kernel void %s(read_only image2d_t src, 404 uint srcw, uint srch, uint w, uint h, 405 const sampler_t sampler, 406 global uchar *dstY, uint strideY, 407 global uchar *dstUV, uint strideUV) { 408 uint gx = get_global_id(0); 409 uint gy = get_global_id(1); 410 411 if ((gx*2 < w) & (gy*2 < h)) { 412 uint srcx = gx*2*srcw/w; 413 uint srcy = gy*2*srch/h; 414 uint4 p1 = read_imageui(src, sampler, (int2)( srcx, srcy )); 415 uint4 p2 = p1; 416 uint4 p3 = p1; 417 uint4 p4 = p1; 418 419 //write up to 4 Y pixels: 420 float Y1 = (0.257 * p1.s%s + 0.504 * p1.s%s + 0.098 * p1.s%s + 16); 421 //same logic as 422P for missing pixels: 422 uint i = gx*2 + gy*2*strideY; 423 dstY[i] = convert_uchar_rte(Y1); 424 if (gx*2+1 < w) { 425 srcx = (gx*2+1)*srcw/w; 426 p2 = read_imageui(src, sampler, (int2)( srcx, srcy )); 427 float Y2 = (0.257 * p2.s%s + 0.504 * p2.s%s + 0.098 * p2.s%s + 16); 428 dstY[i+1] = convert_uchar_rte(Y2); 429 } 430 if (gy*2+1 < h) { 431 i += strideY; 432 srcx = gx*2*srcw/w; 433 srcy = (gy*2+1)*srch/h; 434 p3 = read_imageui(src, sampler, (int2)( srcx, srcy )); 435 float Y3 = (0.257 * p3.s%s + 0.504 * p3.s%s + 0.098 * p3.s%s + 16); 436 dstY[i] = convert_uchar_rte(Y3); 437 if (gx*2+1 < w) { 438 srcx = (gx*2+1)*srcw/w; 439 p4 = read_imageui(src, sampler, (int2)( srcx, srcy )); 440 float Y4 = (0.257 * p4.s%s + 0.504 * p4.s%s + 0.098 * p4.s%s + 16); 441 dstY[i+1] = convert_uchar_rte(Y4); 442 } 443 } 444 uint UVpos = gy*strideUV + gx*2; 445 446 //write 1 U pixel: 447 float U1 = (-0.148 * p1.s%s - 0.291 * p1.s%s + 0.439 * p1.s%s + 128); 448 float U2 = (-0.148 * p2.s%s - 0.291 * p2.s%s + 0.439 * p2.s%s + 128); 449 float U3 = (-0.148 * p3.s%s - 0.291 * p3.s%s + 0.439 * p3.s%s + 128); 450 float U4 = (-0.148 * p4.s%s - 0.291 * p4.s%s + 0.439 * p4.s%s + 128); 451 dstUV[UVpos] = convert_uchar_rte((U1+U2+U3+U4)/4.0); 452 453 //write 1 V pixel: 454 float V1 = (0.439 * p1.s%s - 0.368 * p1.s%s - 0.071 * p1.s%s + 128); 455 float V2 = (0.439 * p2.s%s - 0.368 * p2.s%s - 0.071 * p2.s%s + 128); 456 float V3 = (0.439 * p3.s%s - 0.368 * p3.s%s - 0.071 * p3.s%s + 128); 457 float V4 = (0.439 * p4.s%s - 0.368 * p4.s%s - 0.071 * p4.s%s + 128); 458 dstUV[UVpos + 1] = convert_uchar_rte((V1+V2+V3+V4)/4.0); 459 } 460 } 461 """ 462 return kname, kstr % args 463 464 396 465 RGB_to_YUV_generators = { 397 466 "YUV444P" : gen_rgb_to_yuv444p_kernel, 398 467 "YUV422P" : gen_rgb_to_yuv422p_kernel, 399 468 "YUV420P" : gen_rgb_to_yuv420p_kernel, 469 "NV12" : gen_rgb_to_nv12_kernel, 400 470 } 401 471 402 472 def gen_rgb_to_yuv_kernels(rgb_mode="RGBX", yuv_modes=YUV_FORMATS): -
xpra/codecs/csc_swscale/colorspace_converter.pyx
81 81 COLORSPACES = [] 82 82 #keeping this array in scope ensures the strings don't go away! 83 83 FORMAT_OPTIONS = [ 84 #("AV_PIX_FMT_NV12", (1, 1, 0, 0), (1, 0.5, 0, 0), "NV12"),84 ("AV_PIX_FMT_NV12", (1, 1, 0, 0), (1, 0.5, 0, 0), "NV12"), 85 85 ("AV_PIX_FMT_RGB24", (3, 0, 0, 0), (1, 0, 0, 0), "RGB" ), 86 86 ("AV_PIX_FMT_BGR24", (3, 0, 0, 0), (1, 0, 0, 0), "BGR" ), 87 87 ("AV_PIX_FMT_0RGB", (4, 0, 0, 0), (1, 0, 0, 0), "XRGB" ), … … 238 238 cdef int buffer_size 239 239 240 240 def init_context(self, int src_width, int src_height, src_format, 241 int dst_width, int dst_height, dst_format, int speed=100): #@DuplicatedSignature 241 int dst_width, int dst_height, dst_format, 242 width_stride_rounding, height_stride_rounding, 243 int speed=100): #@DuplicatedSignature 242 244 debug("swscale.ColorspaceConverter.init_context%s", (src_width, src_height, src_format, dst_width, dst_height, dst_format, speed)) 243 245 cdef CSCPixelFormat src 244 246 cdef CSCPixelFormat dst … … 258 260 self.buffer_size = 0 259 261 for i in range(4): 260 262 self.out_height[i] = (int) (dst_height * dst.height_mult[i]) 261 self.out_stride[i] = roundup((int) (dst_width * dst.width_mult[i]), 4) 262 #add one extra line to height so we can read a full rowstride 263 #no matter where we start to read on the last line. 264 #MEMALIGN may be redundant here but it is very cheap 265 if dst_format=="NV12" and i==0: 266 #no padding: packed UV plane follows Y plane 267 self.out_size[i] = self.out_stride[i] * self.out_height[i] 268 else: 269 self.out_size[i] = pad(self.out_stride[i] * (self.out_height[i]+1)) 263 self.out_stride[i] = roundup((int) (dst_width * dst.width_mult[i]), self.width_stride_rounding) 264 #ensure we always add at least one extra line to height via roundup 265 #so we can safely read a full rowstride at a time 266 #no matter where we start to read on the last line of the output image 267 padded_height = roundup(self.out_height[i]+1, self.height_stride_rounding) 268 self.out_size[i] = pad(self.out_stride[i] * padded_height) 270 269 self.buffer_size += self.out_size[i] 271 270 debug("buffer size=%s", self.buffer_size) 272 271 … … 274 273 self.src_height = src_height 275 274 self.dst_width = dst_width 276 275 self.dst_height = dst_height 276 self.width_stride_rounding = width_stride_rounding 277 self.height_stride_rounding = height_stride_rounding 277 278 278 279 self.flags = get_swscale_flags(speed) 279 280 self.time = 0 … … 382 383 csci = CSCImage() #keep a reference to memory for cleanup 383 384 for i in range(4): 384 385 csci.set_plane(i, NULL) 385 if self.dst_format.endswith("P") :386 if self.dst_format.endswith("P") or str(self.dst_format)=="NV12": 386 387 #planar mode, assume 3 planes: 387 388 oplanes = ImageWrapper._3_PLANES 389 if str(self.dst_format)=="NV12": 390 oplanes = ImageWrapper._2_PLANES 388 391 out = [] 389 392 strides = [] 390 for i in range( 3):393 for i in range(oplanes): 391 394 if self.out_stride[i]>0 and output_image[i]!=NULL: 392 395 stride = self.out_stride[i] 393 plane = PyBuffer_FromMemory(<void *>output_image[i], self.out_ height[i] * self.out_stride[i])396 plane = PyBuffer_FromMemory(<void *>output_image[i], self.out_size[i]) 394 397 else: 395 398 stride = 0 396 399 plane = None 397 400 csci.set_plane(i, output_image[i]) 398 401 out.append(plane) 399 402 strides.append(stride) 400 elif str(self.dst_format)=="NV12":401 #Y plane, followed by U and V packed402 oplanes = ImageWrapper.PACKED403 strides = self.out_stride[0]404 out = PyBuffer_FromMemory(<void *>output_image[0], self.buffer_size)405 csci.set_plane(0, output_image[0])406 403 else: 407 404 #assume no planes, plain RGB packed pixels: 408 405 oplanes = ImageWrapper.PACKED 409 406 strides = self.out_stride[0] 410 out = PyBuffer_FromMemory(<void *>output_image[0], self.out_ height[0] * self.out_stride[0])407 out = PyBuffer_FromMemory(<void *>output_image[0], self.out_size[0]) 411 408 csci.set_plane(0, output_image[0]) 412 409 elapsed = time.time()-start 413 410 debug("%s took %.1fms", self, 1000.0*elapsed) -
xpra/codecs/image_wrapper.py
8 8 class ImageWrapper(object): 9 9 10 10 PACKED = 0 11 _2_PLANES = 2 11 12 _3_PLANES = 3 12 13 _4_PLANES = 4 13 14 PLANE_OPTIONS = (PACKED, _3_PLANES, _4_PLANES) 14 15 PLANE_NAMES = {PACKED : "PACKED", 16 _2_PLANES : "2_PLANES", 15 17 _3_PLANES : "3_PLANES", 16 18 _4_PLANES : "4_PLANES"} 17 19 -
xpra/codecs/nvenc/encoder.pyx
1024 1024 return codec_spec(Encoder, codec_type=get_type(), encoding=encoding, 1025 1025 quality=60, setup_cost=100, cpu_cost=10, gpu_cost=100, 1026 1026 min_w=2, min_h=2, max_w=4096, max_h=4096, 1027 width_mask=0xFFFE, height_mask=0xFFFE) 1027 width_mask=0xFFFE, height_mask=0xFFFE, 1028 width_stride_rounding=32, height_stride_rounding=32) 1028 1029 1029 1030 1030 1031 def get_version():