xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #370: csc-with-stride.patch

File csc-with-stride.patch, 19.1 KB (added by Antoine Martin, 8 years ago)

abandoned work on adding stride attributes to csc so nvenc can specify the padding to 32 generically

  • xpra/codecs/codec_constants.py

     
    44# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    55# later version. See the file COPYING for details.
    66
     7#this table is used to convert a dimension (width x height)
     8#into a list of plane sizes
    79PIXEL_SUBSAMPLING = {
     10         "NV12"      : ((1, 1), (1, 2)),            #2 planes: "Y" and "UV" packed together (U and V are subsampled by 2 in both dimensions - but packed the width is "unchanged")
    811         "YUV420P"   : ((1, 1), (2, 2), (2, 2)),
    912         "YUV422P"   : ((1, 1), (2, 1), (2, 1)),
    1013         "YUV444P"   : ((1, 1), (1, 1), (1, 1)),
     
    4851                    quality=100, speed=100,
    4952                    setup_cost=50, cpu_cost=100, gpu_cost=0,
    5053                    min_w=1, min_h=1, max_w=4*1024, max_h=4*1024, max_pixels=4*1024*4*1024,
     54                    width_stride_rounding = 1, height_stride_rounding = 1,
    5155                    can_scale=False,
    5256                    width_mask=0xFFFF, height_mask=0xFFFF):
    5357        self.codec_class = codec_class          #ie: xpra.codecs.enc_x264.encoder.Encoder
     
    6165        self.min_h = min_h
    6266        self.max_w = max_w
    6367        self.max_h = max_h
     68        self.width_stride_rounding = width_stride_rounding
     69        self.height_stride_rounding = height_stride_rounding
    6470        self.max_pixels = max_pixels
    6571        self.width_mask = width_mask
    6672        self.height_mask = height_mask
  • xpra/codecs/csc_nvcuda/colorspace_converter.py

     
    260260        self.dst_width = 0
    261261        self.dst_height = 0
    262262        self.dst_format = ""
     263        self.width_stride_rounding = 0
     264        self.height_stride_rounding = 0
    263265        self.time = 0
    264266        self.frames = 0
    265267        self.kernel_function = None
    266268        self.context = None
    267269
    268270    def init_context(self, src_width, src_height, src_format,
    269                            dst_width, dst_height, dst_format, speed=100):  #@DuplicatedSignature
     271                           dst_width, dst_height, dst_format,
     272                           width_stride_rounding, height_stride_rounding,
     273                           speed=100):  #@DuplicatedSignature
    270274        validate_in_out(src_format, dst_format)
    271275        init_context()
    272276        self.src_width = src_width
     
    275279        self.dst_width = dst_width
    276280        self.dst_height = dst_height
    277281        self.dst_format = dst_format
     282        self.width_stride_rounding = width_stride_rounding
     283        self.height_stride_rounding = height_stride_rounding
    278284        self.context = context
    279285        k = (src_format, dst_format)
    280286        npp_fn = COLORSPACES_MAP.get(k)
     
    452458            driver.memcpy_dtoh_async(pixels, out_buf, stream)
    453459        else:
    454460            #we don't want the crazy large GPU padding, so we do it ourselves:
    455             stride = width*4
     461            stride = roundup(width*4, self.width_stride_rounding)
    456462            pixels = driver.pagelocked_empty(stride*height, dtype=numpy.byte)
    457463            copy = driver.Memcpy2D()
    458464            copy.set_src_device(out_buf)
     
    508514        out_sizes = []
    509515        for i in range(3):
    510516            x_div, y_div = divs[i]
    511             out_stride = roundup(width/x_div, 4)
    512             out_height = roundup(height/y_div, 2)
     517            out_stride = roundup(width/x_div, self.width_stride_rounding)
     518            out_height = roundup(height/y_div, self.height_stride_rounding)
    513519            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
    514520            out_bufs.append(out_buf)
    515521            out_strides.append(out_stride)
  • xpra/codecs/csc_opencl/colorspace_converter.py

     
    159159    #log.info("has_same_channels(%s, %s)=%s (%s - %s)", src, dst, scheck, dcheck, len(scheck)==0 and len(dcheck)==0)
    160160    return len(scheck)==0 and len(dcheck)==0
    161161
    162 KERNELS_DEFS = {}
     162
    163163def gen_yuv_to_rgb():
    164164    global context
    165165    from xpra.codecs.csc_opencl.opencl_kernels import gen_yuv_to_rgb_kernels, rgb_mode_to_indexes, indexes_to_rgb_mode
     
    316316    return RGB_to_YUV_KERNELS
    317317
    318318
     319KERNELS_DEFS = {}
    319320def gen_kernels():
    320321    """
    321322    The code here is complicated by the fact that we don't know
     
    412413        self.dst_width = 0
    413414        self.dst_height = 0
    414415        self.dst_format = ""
     416        self.width_stride_rounding = 0
     417        self.height_stride_rounding = 0
    415418        self.time = 0
    416419        self.frames = 0
    417420        self.queue = None
     
    421424        self.kernel_function_name = None
    422425
    423426    def init_context(self, src_width, src_height, src_format,
    424                            dst_width, dst_height, dst_format, csc_speed=100):  #@DuplicatedSignature
     427                           dst_width, dst_height, dst_format,
     428                           width_stride_rounding, height_stride_rounding,
     429                           csc_speed=100):  #@DuplicatedSignature
    425430        global context
    426431        debug("init_context%s", (src_width, src_height, src_format, dst_width, dst_height, dst_format, csc_speed))
    427432        validate_in_out(src_format, dst_format)
     
    431436        self.dst_width = dst_width
    432437        self.dst_height = dst_height
    433438        self.dst_format = dst_format
     439        self.width_stride_rounding = width_stride_rounding
     440        self.height_stride_rounding = height_stride_rounding
    434441        self.queue = pyopencl.CommandQueue(context)
    435442        #sampling type:
    436443        if self.src_width>self.dst_width and self.src_height>self.dst_height:
     
    557564
    558565        #output image:
    559566        oformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNORM_INT8)
    560         oimage = pyopencl.Image(context, mem_flags.WRITE_ONLY, oformat, shape=(self.dst_width, self.dst_height))
     567        outstride = roundup(self.dst_width, self.width_stride_rounding)
     568        outheight = roundup(self.dst_height, self.height_stride_rounding)
     569        oimage = pyopencl.Image(context, mem_flags.WRITE_ONLY, oformat, shape=(outstride, outheight))
    561570
    562571        iformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)
    563572        for i in range(3):
     
    571580            iimage = pyopencl.Image(context, flags, iformat, shape=shape, hostbuf=plane)
    572581            kernelargs.append(iimage)
    573582
     583        #FIXME: pass out stride!
    574584        kernelargs += [numpy.int32(self.src_width), numpy.int32(self.src_height),
    575585                       numpy.int32(self.dst_width), numpy.int32(self.dst_height),
    576586                       self.sampler, oimage]
     
    582592        kend = time.time()
    583593        debug("%s took %.1fms", self.kernel_function, 1000.0*(kend-kstart))
    584594
    585         out_array = numpy.empty(self.dst_width*self.dst_height*4, dtype=numpy.byte)
     595        out_array = numpy.empty(outstride*outheight*4, dtype=numpy.byte)
    586596        pyopencl.enqueue_read_image(self.queue, oimage, origin=(0, 0), region=(self.dst_width, self.dst_height), hostbuf=out_array, is_blocking=True)
    587597        self.queue.finish()
    588598        debug("readback using %s took %.1fms", CHANNEL_ORDER_TO_STR.get(self.channel_order), 1000.0*(time.time()-kend))
     
    607617        #adjust work dimensions for subsampling:
    608618        #(we process N pixels at a time in each dimension)
    609619        divs = get_subsampling_divs(self.dst_format)
     620        assert len(divs) in ImageWrapper.PLANE_OPTIONS, "invalid number of planes: %s" % len(divs)
    610621        wwidth = dimdiv(self.dst_width, max([x_div for x_div, _ in divs]))
    611622        wheight = dimdiv(self.dst_height, max([y_div for _, y_div in divs]))
    612623        globalWorkSize, localWorkSize  = self.get_work_sizes(wwidth, wheight)
     
    635646        strides = []
    636647        out_buffers = []
    637648        out_sizes = []
    638         for i in range(3):
     649        nplanes = len(divs)
     650        for i in range(nplanes):
    639651            x_div, y_div = divs[i]
    640             p_stride = roundup(self.dst_width / x_div, max(2, localWorkSize[0]))
    641             p_height = roundup(self.dst_height / y_div, 2)
     652            p_stride = roundup(self.dst_width / x_div, max(2, localWorkSize[0], self.width_stride_rounding))
     653            p_height = roundup(self.dst_height / y_div, 2, self.height_stride_rounding)
    642654            p_size = p_stride * p_height
    643655            #debug("output buffer for channel %s: stride=%s, height=%s, size=%s", i, p_stride, p_height, p_size)
    644656            out_buf = pyopencl.Buffer(context, mem_flags.WRITE_ONLY, p_size)
     
    656668        #read back:
    657669        pixels = []
    658670        read_events = []
    659         for i in range(3):
     671        for i in range(nplanes):
    660672            out_array = numpy.empty(out_sizes[i], dtype=numpy.byte)
    661673            pixels.append(out_array.data)
    662674            read = pyopencl.enqueue_read_buffer(self.queue, out_buffers[i], out_array, is_blocking=False)
     
    667679        self.queue.finish()
    668680        readend = time.time()
    669681        debug("wait for read events took %.1fms", 1000.0*(readend-readstart))
    670         return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
     682        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=nplanes)
  • xpra/codecs/csc_opencl/opencl_kernels.py

     
    393393    return kname, kstr % args
    394394
    395395
     396def gen_rgb_to_nv12_kernel(rgb_mode):
     397    RGB_args = rgb_indexes(rgb_mode)
     398    #kernel args: R, G, B are used 12 times each:
     399    kname = "%s_to_nv12" % indexes_to_rgb_mode(RGB_args)
     400    args = tuple([kname]+RGB_args*12)
     401
     402    kstr = """
     403__kernel void %s(read_only image2d_t src,
     404              uint srcw, uint srch, uint w, uint h,
     405              const sampler_t sampler,
     406              global uchar *dstY, uint strideY,
     407              global uchar *dstUV, uint strideUV) {
     408    uint gx = get_global_id(0);
     409    uint gy = get_global_id(1);
     410
     411    if ((gx*2 < w) & (gy*2 < h)) {
     412        uint srcx = gx*2*srcw/w;
     413        uint srcy = gy*2*srch/h;
     414        uint4 p1 = read_imageui(src, sampler, (int2)( srcx, srcy ));
     415        uint4 p2 = p1;
     416        uint4 p3 = p1;
     417        uint4 p4 = p1;
     418
     419        //write up to 4 Y pixels:
     420        float Y1 =  (0.257 * p1.s%s + 0.504 * p1.s%s + 0.098 * p1.s%s + 16);
     421        //same logic as 422P for missing pixels:
     422        uint i = gx*2 + gy*2*strideY;
     423        dstY[i] = convert_uchar_rte(Y1);
     424        if (gx*2+1 < w) {
     425            srcx = (gx*2+1)*srcw/w;
     426            p2 = read_imageui(src, sampler, (int2)( srcx, srcy ));
     427            float Y2 =  (0.257 * p2.s%s + 0.504 * p2.s%s + 0.098 * p2.s%s + 16);
     428            dstY[i+1] = convert_uchar_rte(Y2);
     429        }
     430        if (gy*2+1 < h) {
     431            i += strideY;
     432            srcx = gx*2*srcw/w;
     433            srcy = (gy*2+1)*srch/h;
     434            p3 = read_imageui(src, sampler, (int2)( srcx, srcy ));
     435            float Y3 =  (0.257 * p3.s%s + 0.504 * p3.s%s + 0.098 * p3.s%s + 16);
     436            dstY[i] = convert_uchar_rte(Y3);
     437            if (gx*2+1 < w) {
     438                srcx = (gx*2+1)*srcw/w;
     439                p4 = read_imageui(src, sampler, (int2)( srcx, srcy ));
     440                float Y4 =  (0.257 * p4.s%s + 0.504 * p4.s%s + 0.098 * p4.s%s + 16);
     441                dstY[i+1] = convert_uchar_rte(Y4);
     442            }
     443        }
     444        uint UVpos = gy*strideUV + gx*2;
     445
     446        //write 1 U pixel:
     447        float U1 = (-0.148 * p1.s%s - 0.291 * p1.s%s + 0.439 * p1.s%s + 128);
     448        float U2 = (-0.148 * p2.s%s - 0.291 * p2.s%s + 0.439 * p2.s%s + 128);
     449        float U3 = (-0.148 * p3.s%s - 0.291 * p3.s%s + 0.439 * p3.s%s + 128);
     450        float U4 = (-0.148 * p4.s%s - 0.291 * p4.s%s + 0.439 * p4.s%s + 128);
     451        dstUV[UVpos] = convert_uchar_rte((U1+U2+U3+U4)/4.0);
     452
     453        //write 1 V pixel:
     454        float V1 =  (0.439 * p1.s%s - 0.368 * p1.s%s - 0.071 * p1.s%s + 128);
     455        float V2 =  (0.439 * p2.s%s - 0.368 * p2.s%s - 0.071 * p2.s%s + 128);
     456        float V3 =  (0.439 * p3.s%s - 0.368 * p3.s%s - 0.071 * p3.s%s + 128);
     457        float V4 =  (0.439 * p4.s%s - 0.368 * p4.s%s - 0.071 * p4.s%s + 128);
     458        dstUV[UVpos + 1] = convert_uchar_rte((V1+V2+V3+V4)/4.0);
     459    }
     460}
     461"""
     462    return kname, kstr % args
     463
     464
    396465RGB_to_YUV_generators = {
    397466                    "YUV444P"   : gen_rgb_to_yuv444p_kernel,
    398467                    "YUV422P"   : gen_rgb_to_yuv422p_kernel,
    399468                    "YUV420P"   : gen_rgb_to_yuv420p_kernel,
     469                    "NV12"      : gen_rgb_to_nv12_kernel,
    400470                    }
    401471
    402472def gen_rgb_to_yuv_kernels(rgb_mode="RGBX", yuv_modes=YUV_FORMATS):
  • xpra/codecs/csc_swscale/colorspace_converter.pyx

     
    8181COLORSPACES = []
    8282#keeping this array in scope ensures the strings don't go away!
    8383FORMAT_OPTIONS = [
    84 #    ("AV_PIX_FMT_NV12",     (1, 1, 0, 0),       (1, 0.5, 0, 0),     "NV12"),
     84    ("AV_PIX_FMT_NV12",     (1, 1, 0, 0),       (1, 0.5, 0, 0),     "NV12"),
    8585    ("AV_PIX_FMT_RGB24",    (3, 0, 0, 0),       (1, 0, 0, 0),       "RGB"   ),
    8686    ("AV_PIX_FMT_BGR24",    (3, 0, 0, 0),       (1, 0, 0, 0),       "BGR"   ),
    8787    ("AV_PIX_FMT_0RGB",     (4, 0, 0, 0),       (1, 0, 0, 0),       "XRGB"  ),
     
    238238    cdef int buffer_size
    239239
    240240    def init_context(self, int src_width, int src_height, src_format,
    241                            int dst_width, int dst_height, dst_format, int speed=100):    #@DuplicatedSignature
     241                           int dst_width, int dst_height, dst_format,
     242                           width_stride_rounding, height_stride_rounding,
     243                           int speed=100):    #@DuplicatedSignature
    242244        debug("swscale.ColorspaceConverter.init_context%s", (src_width, src_height, src_format, dst_width, dst_height, dst_format, speed))
    243245        cdef CSCPixelFormat src
    244246        cdef CSCPixelFormat dst
     
    258260        self.buffer_size = 0
    259261        for i in range(4):
    260262            self.out_height[i] = (int) (dst_height * dst.height_mult[i])
    261             self.out_stride[i] = roundup((int) (dst_width * dst.width_mult[i]), 4)
    262             #add one extra line to height so we can read a full rowstride
    263             #no matter where we start to read on the last line.
    264             #MEMALIGN may be redundant here but it is very cheap
    265             if dst_format=="NV12" and i==0:
    266                 #no padding: packed UV plane follows Y plane
    267                 self.out_size[i] = self.out_stride[i] * self.out_height[i]
    268             else:
    269                 self.out_size[i] = pad(self.out_stride[i] * (self.out_height[i]+1))
     263            self.out_stride[i] = roundup((int) (dst_width * dst.width_mult[i]), self.width_stride_rounding)
     264            #ensure we always add at least one extra line to height via roundup
     265            #so we can safely read a full rowstride at a time
     266            #no matter where we start to read on the last line of the output image
     267            padded_height = roundup(self.out_height[i]+1, self.height_stride_rounding)
     268            self.out_size[i] = pad(self.out_stride[i] * padded_height)
    270269            self.buffer_size += self.out_size[i]
    271270        debug("buffer size=%s", self.buffer_size)
    272271
     
    274273        self.src_height = src_height
    275274        self.dst_width = dst_width
    276275        self.dst_height = dst_height
     276        self.width_stride_rounding = width_stride_rounding
     277        self.height_stride_rounding = height_stride_rounding
    277278
    278279        self.flags = get_swscale_flags(speed)
    279280        self.time = 0
     
    382383        csci = CSCImage()           #keep a reference to memory for cleanup
    383384        for i in range(4):
    384385            csci.set_plane(i, NULL)
    385         if self.dst_format.endswith("P"):
     386        if self.dst_format.endswith("P") or str(self.dst_format)=="NV12":
    386387            #planar mode, assume 3 planes:
    387388            oplanes = ImageWrapper._3_PLANES
     389            if str(self.dst_format)=="NV12":
     390                oplanes = ImageWrapper._2_PLANES
    388391            out = []
    389392            strides = []
    390             for i in range(3):
     393            for i in range(oplanes):
    391394                if self.out_stride[i]>0 and output_image[i]!=NULL:
    392395                    stride = self.out_stride[i]
    393                     plane = PyBuffer_FromMemory(<void *>output_image[i], self.out_height[i] * self.out_stride[i])
     396                    plane = PyBuffer_FromMemory(<void *>output_image[i], self.out_size[i])
    394397                else:
    395398                    stride = 0
    396399                    plane = None
    397400                csci.set_plane(i, output_image[i])
    398401                out.append(plane)
    399402                strides.append(stride)
    400         elif str(self.dst_format)=="NV12":
    401             #Y plane, followed by U and V packed
    402             oplanes = ImageWrapper.PACKED
    403             strides = self.out_stride[0]
    404             out = PyBuffer_FromMemory(<void *>output_image[0], self.buffer_size)
    405             csci.set_plane(0, output_image[0])
    406403        else:
    407404            #assume no planes, plain RGB packed pixels:
    408405            oplanes = ImageWrapper.PACKED
    409406            strides = self.out_stride[0]
    410             out = PyBuffer_FromMemory(<void *>output_image[0], self.out_height[0] * self.out_stride[0])
     407            out = PyBuffer_FromMemory(<void *>output_image[0], self.out_size[0])
    411408            csci.set_plane(0, output_image[0])
    412409        elapsed = time.time()-start
    413410        debug("%s took %.1fms", self, 1000.0*elapsed)
  • xpra/codecs/image_wrapper.py

     
    88class ImageWrapper(object):
    99
    1010    PACKED = 0
     11    _2_PLANES = 2
    1112    _3_PLANES = 3
    1213    _4_PLANES = 4
    1314    PLANE_OPTIONS = (PACKED, _3_PLANES, _4_PLANES)
    1415    PLANE_NAMES = {PACKED       : "PACKED",
     16                   _2_PLANES    : "2_PLANES",
    1517                   _3_PLANES    : "3_PLANES",
    1618                   _4_PLANES    : "4_PLANES"}
    1719
  • xpra/codecs/nvenc/encoder.pyx

     
    10241024    return codec_spec(Encoder, codec_type=get_type(), encoding=encoding,
    10251025                      quality=60, setup_cost=100, cpu_cost=10, gpu_cost=100,
    10261026                      min_w=2, min_h=2, max_w=4096, max_h=4096,
    1027                       width_mask=0xFFFE, height_mask=0xFFFE)
     1027                      width_mask=0xFFFE, height_mask=0xFFFE,
     1028                      width_stride_rounding=32, height_stride_rounding=32)
    10281029
    10291030
    10301031def get_version():