xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #384: csc_nvcuda-v7.patch

File csc_nvcuda-v7.patch, 46.3 KB (added by Antoine Martin, 8 years ago)

work in progress pycuda code: done rgb to yuv

  • xpra/codecs/csc_nvcuda/colorspace_converter.py

     
    11# This file is part of Xpra.
    2 # Copyright (C) 2013 Arthur Huillet
    32# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
    43# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    54# later version. See the file COPYING for details.
    65
     6
     7from xpra.log import Logger, debug_if_env
     8log = Logger()
     9debug = debug_if_env(log, "XPRA_CUDA_DEBUG")
     10error = log.error
     11
     12import numpy
    713import time
     14import ctypes
     15import sys
     16assert bytearray
     17import pycuda               #@UnresolvedImport
     18from pycuda import driver   #@UnresolvedImport
     19driver.init()
     20
     21log.info("PyCUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
     22log.info("PyCUDA driver version=%s", driver.get_driver_version())
     23
     24ngpus = driver.Device.count()
     25log.info("PyCUDA found %s devices:", ngpus)
     26selected_device = None
     27for i in range(ngpus):
     28    d = driver.Device(i)
     29    attr = d.get_attributes()
     30    debug("compute_capability=%s, attributes=%s", d.compute_capability(), attr)
     31    host_mem = d.get_attribute(driver.device_attribute.CAN_MAP_HOST_MEMORY)
     32    debug("CAN_MAP_HOST_MEMORY=%s", host_mem)
     33    pre = "-"
     34    if host_mem:
     35        pre = "+"
     36    log.info(" %s %s @ %s (%sMB)", pre, d.name(), d.pci_bus_id(), int(d.total_memory()/1024/1024))
     37    if host_mem and selected_device is None:
     38        selected_device = d
     39assert selected_device is not None
     40context = selected_device.make_context(flags=driver.ctx_flags.SCHED_YIELD | driver.ctx_flags.MAP_HOST)
     41debug("testing with context=%s", context)
     42debug("api version=%s", context.get_api_version())
     43free, total = driver.mem_get_info()
     44debug("using device %s, memory: free=%sMB, total=%sMB",  selected_device, int(free/1024/1024), int(total/1024/1024))
     45context.pop()
     46#context.detach()
     47#del context
     48
     49
     50def find_lib(basename):
     51    try:
     52        if sys.platform == "win32":
     53            libname = basename+".dll"
     54        else:
     55            libname = basename+".so"
     56        return ctypes.cdll.LoadLibrary(libname)
     57    except Exception, e:
     58        debug("could not find %s: %s", basename, e)
     59        return None
     60   
     61_NPP_LIBRARY_NAMES = ["libnppi",    #CUDA5.5
     62                      "libnpp"]     #CUDA5.0
     63_NPP_LIBRARIES = []
     64for name in _NPP_LIBRARY_NAMES:
     65    lib = find_lib(name)
     66    if lib:
     67        _NPP_LIBRARIES.append(lib)
     68if len(_NPP_LIBRARIES)==0:
     69    raise ImportError("failed to load npp library - check your library path")
     70CUDART_LIBRARY = find_lib("libcudart")
     71assert hasattr(CUDART_LIBRARY, "cudaMalloc"), "cudaMalloc not found in %s" % CUDART_LIBRARY
     72cudaMalloc = CUDART_LIBRARY.cudaMalloc
     73cudaMalloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t)]
     74
     75from xpra.codecs.image_wrapper import ImageWrapper
    876from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
    9 from xpra.codecs.image_wrapper import ImageWrapper
    1077
    11 cdef extern from "stdlib.h":
    12     void free(void *ptr)
    1378
    14 cdef extern from "Python.h":
    15     ctypedef int Py_ssize_t
    16     ctypedef object PyObject
    17     object PyBuffer_FromMemory(void *ptr, Py_ssize_t size)
    18     int PyObject_AsReadBuffer(object obj, void ** buffer, Py_ssize_t * buffer_len) except -1
     79class NppiSize(ctypes.Structure):
     80    _fields_ = [("width", ctypes.c_int),
     81                ("height", ctypes.c_int)]
    1982
    20 ctypedef unsigned char uint8_t
    21 ctypedef void csc_nvcuda_ctx
    22 cdef extern from "csc_nvcuda.h":
    23 #char **get_supported_colorspaces()
     83RGB_to_YUV444P_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]
     84RGB_to_YUV42xP_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]
    2485
    25     int init_cuda()
    26     csc_nvcuda_ctx *init_csc(int src_width, int src_height, const char *src_format, const char *dst_format)
    27     void free_csc(csc_nvcuda_ctx *ctx)
    28     int csc_image(csc_nvcuda_ctx *ctx, const uint8_t *input_image[3], const int in_stride[3], uint8_t *out[3], int out_stride[3]) nogil
    29     void free_csc_image(uint8_t *buf[3])
    30     char *get_NPP_version()
    3186
     87COLORSPACES_MAP_STR  = {
     88                    ("RGBX",    "YUV444P")  : ("nppiRGBToYCbCr_8u_C3P3R",   RGB_to_YUV444P_argtypes),
     89                    ("RGBA",    "YUV444P")  : ("nppiRGBToYCbCr_8u_AC4P3R",  RGB_to_YUV444P_argtypes),
     90                    ("YUV444P", "RGB")      : ("nppiYCbCrToRGB_8u_P3C3R",   []),
     91                    ("YUV444P", "BGR")      : ("nppiYCbCrToBGR_8u_P3C3R",   []),
     92                    ("YUV444P", "RGBX")     : ("nppiYCbCrToRGB_8u_P3C4R",   []),
     93                    ("YUV444P", "BGRX")     : ("nppiYCbCrToBGR_8u_P3C4R",   []),
     94                    #BGR / BGRA: need nppiSwap(Channels before one of the above
     95                    ("RGBX",    "YUV422P")  : ("nppiRGBToYCbCr422_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     96                    ("BGRX",    "YUV422P")  : ("nppiBGRToYCbCr422_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     97                    ("BGRA",    "YUV422P")  : ("nppiBGRToYCbCr422_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
     98                    ("YUV422P", "RGB")      : ("nppiYCbCr422ToRGB_8u_P3C3R",    []),
     99                    ("YUV422P", "BGR")      : ("nppiYCbCr422ToBGR_8u_P3C3R",    []),
     100                    #YUV420P:
     101                    ("RGBX",    "YUV420P")  : ("nppiRGBToYCbCr420_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     102                    ("BGRX",    "YUV420P")  : ("nppiBGRToYCbCr420_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     103                    ("RGBA",    "YUV420P")  : ("nppiRGBToYCrCb420_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
     104                    ("BGRA",    "YUV420P")  : ("nppiBGRToYCbCr420_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
     105                    ("YUV420P", "RGB")      : ("nppiYCbCr420ToRGB_8u_P3C3R",    []),
     106                    ("YUV420P", "BGR")      : ("nppiYCbCr420ToBGR_8u_P3C3R",    []),
     107                    ("YUV420P", "RGBX")     : ("nppiYCrCb420ToRGB_8u_P3C4R",    []),
     108                    ("YUV420P", "BGRX")     : ("nppiYCbCr420ToBGR_8u_P3C4R",    []),
     109                    }
     110#ie:
     111#BGR to YUV420P:
     112#NppStatus nppiBGRToYCbCr420_8u_C3P3R (const Npp8u *pSrc, int nSrcStep, Npp8u *pDst[3], int rDstStep[3], NppiSize oSizeROI)
     113#pSrc Source-Image Pointer.
     114#nSrcStep Source-Image Line Step.
     115#pDst Destination-Planar-Image Pointer Array.
     116#rDstStep Destination-Planar-Image Line Step Array.
     117#oSizeROI Region-of-Interest (ROI). (struct with width and height)
     118#Returns:
     119#Image Data Related Error Codes, ROI Related Error Codes
    32120
    33 _init = None
    34 def init():
    35     global _init
    36     if _init is None:
    37         _init = init_cuda()==0
    38     return _init
    39 init()
     121#For YUV444P:
     122#NppStatus nppiRGBToYCbCr_8u_C3P3R(const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int nDstStep, NppiSize oSizeROI);
     123#(only one nDstStep!
    40124
    41 COLORSPACES_SRC = [ "RGB", "RGBA", "BGR", "BGRX" ]
    42 COLORSPACES_DST = [ "YUV420P", "YUV422P", "YUV444P" ]
     125#YUV420P to RGB:
     126#NppStatus nppiYCbCrToRGB_8u_P3C3R(const Npp8u * const pSrc[3], int nSrcStep, Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
     127#YUV444P to RGB:
     128#NppStatus nppiYCrCb420ToRGB_8u_P3C4R(const Npp8u * const pSrc[3],int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
     129#Those with alpha add:
     130#Npp8u nAval
    43131
     132
     133WARNINGS = {
     134            #NPP_NO_OPERATION_WARNING:
     135            1   :   "Indicates that no operation was performed",
     136            #NPP_DIVIDE_BY_ZERO_WARNING:
     137            6   :   "Divisor is zero however does not terminate the execution",
     138            #NPP_AFFINE_QUAD_INCORRECT_WARNING:
     139            28  :   "Indicates that the quadrangle passed to one of affine warping functions doesn't have necessary properties. First 3 vertices are used, the fourth vertex discarded",
     140            #NPP_WRONG_INTERSECTION_ROI_WARNING
     141            29  :   "The given ROI has no interestion with either the source or destination ROI. Thus no operation was performed",
     142            #NPP_WRONG_INTERSECTION_QUAD_WARNING:
     143            30  :   "The given quadrangle has no intersection with either the source or destination ROI. Thus no operation was performed",
     144            #NPP_DOUBLE_SIZE_WARNING:
     145            35  :   "Image size isn't multiple of two. Indicates that in case of 422/411/420 sampling the ROI width/height was modified for proper processing",
     146            #NPP_MISALIGNED_DST_ROI_WARNING:
     147            10000 : "Speed reduction due to uncoalesced memory accesses warning"
     148            }
     149
     150#typedef unsigned char       Npp8u;     /**<  8-bit unsigned chars */
     151#typedef struct
     152#{
     153#    int width;  /**<  Rectangle width. */
     154#    int height; /**<  Rectangle height. */
     155#} NppiSize;
     156#NppStatus is an enum.. (0==NPP_NO_ERROR)
     157
     158def roundup(n, m):
     159    return (n + m - 1) & ~(m - 1)
     160
     161
     162COLORSPACES_MAP = {}
     163for k, f_def in COLORSPACES_MAP_STR.items():
     164    fn, argtypes = f_def
     165    try:
     166        for lib in _NPP_LIBRARIES:
     167            if hasattr(lib, fn):
     168                cfn = getattr(lib, fn)
     169                debug("found %s for %s in %s: %s", fn, k, lib, cfn)
     170                COLORSPACES_MAP[k] = (fn, cfn)
     171                #set argument types and return type:
     172                cfn.restype = ctypes.c_int
     173                cfn.argtypes = argtypes
     174    except:
     175        log.error("could not load '%s', conversion disabled: %s", fn, k)
     176
     177
     178def get_type():
     179    return "nvcuda"
     180
    44181def get_version():
    45     return get_NPP_version()
     182    return pycuda.VERSION_TEXT
    46183
    47184def get_input_colorspaces():
    48     if not init():
    49         return []
    50     return COLORSPACES_SRC
     185    return sorted(set([src for src, _ in COLORSPACES_MAP.keys()]))
    51186
    52187def get_output_colorspaces(input_colorspace):
    53     if not init():
    54         return []
    55     #exclude input colorspace:
    56     return COLORSPACES_DST
     188    return sorted(set(dst for src,dst in COLORSPACES_MAP.keys() if src==input_colorspace))
    57189
     190def validate_in_out(in_colorspace, out_colorspace):
     191    assert in_colorspace in get_input_colorspaces(), "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, get_input_colorspaces())
     192    assert out_colorspace in get_output_colorspaces(in_colorspace), "invalid output colorspace: %s (must be one of %s for input %s)" % (out_colorspace, get_output_colorspaces(in_colorspace), in_colorspace)
     193
    58194def get_spec(in_colorspace, out_colorspace):
    59     assert init(), "nvcuda is not available!"
    60     assert in_colorspace in COLORSPACES_SRC, "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, COLORSPACES_SRC)
    61     assert out_colorspace in COLORSPACES_DST, "invalid output colorspace: %s (must be one of %s)" % (out_colorspace, COLORSPACES_DST)
     195    validate_in_out(in_colorspace, out_colorspace)
    62196    #ratings: quality, speed, setup cost, cpu cost, gpu cost, latency, max_w, max_h, max_pixels
    63197    return codec_spec(ColorspaceConverter, speed=100, setup_cost=10, cpu_cost=10, gpu_cost=50, min_w=16, min_h=16, can_scale=False)
    64198
    65199
    66 cdef class CSCImage:
    67     """
    68         Allows us to call free_csc_image
    69         when this object is garbage collected
    70     """
    71     cdef uint8_t *buf[3]
    72     cdef int freed
     200class ColorspaceConverter(object):
    73201
    74     cdef set_plane(self, int plane, uint8_t *buf):
    75         assert plane in (0, 1, 2)
    76         self.buf[plane] = buf
     202    def __init__(self):
     203        self.src_width = 0
     204        self.src_height = 0
     205        self.src_format = ""
     206        self.dst_width = 0
     207        self.dst_height = 0
     208        self.dst_format = ""
     209        self.time = 0
     210        self.frames = 0
     211        self.kernel_function = None
    77212
    78     def __dealloc__(self):
    79         #print("CSCImage.__dealloc__() calling free()")
    80         self.free()
    81 
    82     def free(self):
    83         #print("CSCImage.free() free_csc_image(..) already? %s" % self.freed)
    84         if self.freed==0:
    85             self.freed = 1
    86             free_csc_image(self.buf)
    87 
    88 
    89 class CSCImageWrapper(ImageWrapper):
    90 
    91     def free(self):                             #@DuplicatedSignature
    92         #print("YUVImageWrapper.free() csc_image=%s" % self.csc_image)
    93         if self.csc_image:
    94             self.csc_image.free()
    95             self.csc_image = None
    96 
    97 cdef class ColorspaceConverter:
    98     cdef int frames
    99     cdef csc_nvcuda_ctx *context
    100     cdef int src_width
    101     cdef int src_height
    102     cdef char* src_format
    103     cdef int dst_width
    104     cdef int dst_height
    105     cdef char* dst_format
    106     cdef double time
    107 
    108     def init_context(self, int src_width, int src_height, src_format,
    109                            int dst_width, int dst_height, dst_format, int speed=100):    #@DuplicatedSignature
     213    def init_context(self, src_width, src_height, src_format,
     214                           dst_width, dst_height, dst_format):    #@DuplicatedSignature
     215        validate_in_out(src_format, dst_format)
    110216        self.src_width = src_width
    111217        self.src_height = src_height
     218        self.src_format = src_format
    112219        self.dst_width = dst_width
    113220        self.dst_height = dst_height
    114         self.time = 0
    115         #ugly trick to use a string which won't go away from underneath us:
    116         assert src_format in COLORSPACES_SRC, "invalid source format: %s" % src_format
    117         for x in COLORSPACES_SRC:
    118             if x==src_format:
    119                 self.src_format = x
    120                 break
    121         assert dst_format in COLORSPACES_DST, "invalid destination format: %s" % dst_format
    122         for x in COLORSPACES_DST:
    123             if x==dst_format:
    124                 self.dst_format = x
    125                 break
    126         self.frames = 0
    127         self.context = init_csc(self.src_width, self.src_height, self.src_format, self.dst_format)
     221        self.dst_format = dst_format
     222        context.push()
     223        context.synchronize()
     224        k = (src_format, dst_format)
     225        npp_fn = COLORSPACES_MAP.get(k)
     226        assert npp_fn is not None, "invalid pair: %s" % k
     227        self.kernel_function_name, cfn = npp_fn
     228        debug("init_context%s npp conversion function=%s (%s)", (src_width, src_height, src_format, dst_width, dst_height, dst_format), self.kernel_function_name, cfn)
     229        self.kernel_function = cfn
     230        if src_format.find("YUV")>=0:
     231            self.convert_image = self.convert_image_yuv
     232        else:
     233            self.convert_image = self.convert_image_rgb
     234        debug("init_context(..) convert_image=%s", self.convert_image)
    128235
    129236    def get_info(self):
    130237        info = {"frames"    : self.frames,
     
    141248        return info
    142249
    143250    def __str__(self):
    144         return "nvcuda(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
     251        if self.queue is None:
     252            return "opencl(uninitialized)"
     253        return "opencl(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
    145254                                                 self.dst_format, self.dst_width, self.dst_height)
    146255
    147     def __dealloc__(self):                  #@DuplicatedSignature
     256    def is_closed(self):
     257        return False
     258
     259    def __del__(self):                  #@DuplicatedSignature
    148260        self.clean()
    149261
    150262    def get_src_width(self):
     
    166278        return self.dst_format
    167279
    168280    def get_type(self):
    169         return  "nvcuda"
     281        return  "opencl"
    170282
    171283
    172284    def clean(self):                        #@DuplicatedSignature
    173         if self.context!=NULL:
    174             free_csc(self.context)
    175             free(self.context)
    176             self.context = NULL
     285        if context:
     286            context.pop()
    177287
    178288    def convert_image(self, image):
    179         cdef Py_ssize_t pic_buf_len = 0
    180         assert self.context!=NULL
    181         cdef const uint8_t *input_image[3]
    182         cdef uint8_t *output_image[3]
    183         cdef int input_stride[3]
    184         cdef int output_stride[3]
    185         cdef int planes
    186         cdef int i                          #@DuplicatedSignature
    187         cdef int height
    188         cdef int stride
    189         cdef int result
    190         planes = image.get_planes()
    191         assert planes in (0, 1, 3), "invalid number of planes: %s" % planes
    192         input = image.get_pixels()
    193         strides = image.get_rowstride()
    194         if planes==0:
    195             #magic: if planes==0, this is an XImageWrapper... with raw pixels/rowstride
    196             input = [input]
    197             strides = [strides]
    198             planes = 1
    199         #print("convert_image(%s) input=%s, strides=%s" % (image, len(input), strides))
    200         assert len(input)==planes, "expected %s planes but found %s" % (planes, len(input))
    201         assert len(strides)==planes, "expected %s rowstrides but found %s" % (planes, len(strides))
    202         for i in range(planes):
    203             input_stride[i] = strides[i]
    204             PyObject_AsReadBuffer(input[i], <const void**> &input_image[i], &pic_buf_len)
    205         start = time.time()
    206         with nogil:
    207             result = csc_image(self.context, input_image, input_stride, output_image, output_stride)
    208         if result != 0:
     289        #we override this method during init_context
     290        raise Exception("not initialized!")
     291
     292
     293    def convert_image_rgb(self, image):
     294        global program
     295        iplanes = image.get_planes()
     296        width = image.get_width()
     297        height = image.get_height()
     298        stride = image.get_rowstride()
     299        size = image.get_size()
     300        pixels = image.get_pixels()
     301        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
     302        assert iplanes==ImageWrapper.PACKED_RGB, "we only handle packed rgb as input!"
     303        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
     304
     305        divs = get_subsampling_divs(self.dst_format)
     306
     307        #copy pixels to GPU:
     308        upload_start = time.time()
     309        gpu_image = driver.to_device(pixels)
     310        upload_end = time.time()
     311        debug("%s pixels now on GPU at %s, took %.1fms", size, gpu_image, upload_end-upload_start)
     312
     313        #argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]
     314        #argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]
     315        out_t = self.kernel_function.argtypes[2]            #ie: (ctypes.c_void_p)*3
     316        out_strides_t = self.kernel_function.argtypes[3]    #ie: (ctypes.c_int)*3 OR ctypes.c_int
     317        out_bufs = []
     318        out_strides = []
     319        out_sizes = []
     320        for i in range(3):
     321            x_div, y_div = divs[i]
     322            out_stride = roundup(width/x_div, 4)
     323            out_height = roundup(height/y_div, 2)
     324            #mem_alloc_pitch returns the real out stride:
     325            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
     326            out_bufs.append(out_buf)
     327            out_strides.append(out_stride)
     328            out_sizes.append((out_stride, out_height))
     329        dest = out_t(*[ctypes.cast(int(out_buf), ctypes.c_void_p) for out_buf in out_bufs])
     330        if out_strides_t==ctypes.c_int:
     331            #one stride for all planes (this must be YUV444P)
     332            assert len(set(out_strides))==1, "more than one stride where only one expected in: %s" % out_strides
     333            out_strides = [out_strides[0]]
     334        args = [ctypes.cast(int(gpu_image), ctypes.c_void_p), stride, dest, out_strides_t(*out_strides), NppiSize(width, height)]
     335        debug("calling %s%s", self.kernel_function_name, tuple(args))
     336        kstart = time.time()
     337        v = self.kernel_function(*args)
     338        if v<0:
     339            log.error("%s%s returned %s", self.kernel_function_name, args, v)
    209340            return None
    210         end = time.time()
    211         self.time += (end-start)
     341        elif v>0:
     342            #positive return-codes indicate warnings:
     343            warning = WARNINGS.get(v, "unknown")
     344            log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)
     345        kend = time.time()
     346        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
     347        gpu_image.free()
    212348        self.frames += 1
    213         #now parse the output:
    214         csci = CSCImage()           #keep a reference to memory for cleanup
    215         if self.dst_format.endswith("P"):
    216             nplanes = 3
    217             divs = get_subsampling_divs(self.dst_format)
    218             #print("convert_image(%s) nplanes=%s, divs=%s" % (image, nplanes, divs))
    219             out = []
    220             strides = []
    221             for i in range(nplanes):
    222                 _, dy = divs[i]
    223                 if dy==1:
    224                     height = self.dst_height
    225                 elif dy==2:
    226                     height = (self.dst_height+1)>>1
    227                 else:
    228                     raise Exception("invalid height divisor %s" % dy)
    229                 stride = output_stride[i]
    230                 if stride>0 and output_image[i]!=NULL:
    231                     plane = PyBuffer_FromMemory(<void *>output_image[i], height * stride)
    232                 else:
    233                     stride = 0
    234                     plane = None
    235                 csci.set_plane(i, output_image[i])
    236                 out.append(plane)
    237                 strides.append(stride)
    238         else:
    239             nplanes = 0
    240             strides = output_stride[0]
    241             out = PyBuffer_FromMemory(<void *>output_image[0], self.dst_height * strides)
    242             csci.set_plane(0, output_image[0])
    243         out_image = CSCImageWrapper(0, 0, self.dst_width, self.dst_height, out, self.dst_format, 24, strides, nplanes)
    244         out_image.csc_image = csci
    245         return out_image
     349        read_start = time.time()
     350        pixels = []
     351        for i in range(3):
     352            plane = driver.aligned_empty(out_sizes[i], dtype=numpy.byte)
     353            driver.memcpy_dtoh(plane, out_bufs[i])
     354            out_bufs[i].free()
     355            pixels.append(plane.data)
     356        context.synchronize()
     357        read_end = time.time()
     358        debug("read back took %.1fms", (read_end-read_start)*1000.0)
     359        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, out_strides, planes=ImageWrapper._3_PLANES)
  • xpra/codecs/csc_nvcuda/colorspace_converter.pyx

     
    1 # This file is part of Xpra.
    2 # Copyright (C) 2013 Arthur Huillet
    3 # Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
    4 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    5 # later version. See the file COPYING for details.
    6 
    7 import time
    8 from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
    9 from xpra.codecs.image_wrapper import ImageWrapper
    10 
    11 cdef extern from "stdlib.h":
    12     void free(void *ptr)
    13 
    14 cdef extern from "Python.h":
    15     ctypedef int Py_ssize_t
    16     ctypedef object PyObject
    17     object PyBuffer_FromMemory(void *ptr, Py_ssize_t size)
    18     int PyObject_AsReadBuffer(object obj, void ** buffer, Py_ssize_t * buffer_len) except -1
    19 
    20 ctypedef unsigned char uint8_t
    21 ctypedef void csc_nvcuda_ctx
    22 cdef extern from "csc_nvcuda.h":
    23 #char **get_supported_colorspaces()
    24 
    25     int init_cuda()
    26     csc_nvcuda_ctx *init_csc(int src_width, int src_height, const char *src_format, const char *dst_format)
    27     void free_csc(csc_nvcuda_ctx *ctx)
    28     int csc_image(csc_nvcuda_ctx *ctx, const uint8_t *input_image[3], const int in_stride[3], uint8_t *out[3], int out_stride[3]) nogil
    29     void free_csc_image(uint8_t *buf[3])
    30     char *get_NPP_version()
    31 
    32 
    33 _init = None
    34 def init():
    35     global _init
    36     if _init is None:
    37         _init = init_cuda()==0
    38     return _init
    39 init()
    40 
    41 COLORSPACES_SRC = [ "RGB", "RGBA", "BGR", "BGRX" ]
    42 COLORSPACES_DST = [ "YUV420P", "YUV422P", "YUV444P" ]
    43 
    44 def get_version():
    45     return get_NPP_version()
    46 
    47 def get_input_colorspaces():
    48     if not init():
    49         return []
    50     return COLORSPACES_SRC
    51 
    52 def get_output_colorspaces(input_colorspace):
    53     if not init():
    54         return []
    55     #exclude input colorspace:
    56     return COLORSPACES_DST
    57 
    58 def get_spec(in_colorspace, out_colorspace):
    59     assert init(), "nvcuda is not available!"
    60     assert in_colorspace in COLORSPACES_SRC, "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, COLORSPACES_SRC)
    61     assert out_colorspace in COLORSPACES_DST, "invalid output colorspace: %s (must be one of %s)" % (out_colorspace, COLORSPACES_DST)
    62     #ratings: quality, speed, setup cost, cpu cost, gpu cost, latency, max_w, max_h, max_pixels
    63     return codec_spec(ColorspaceConverter, speed=100, setup_cost=10, cpu_cost=10, gpu_cost=50, min_w=16, min_h=16, can_scale=False)
    64 
    65 
    66 cdef class CSCImage:
    67     """
    68         Allows us to call free_csc_image
    69         when this object is garbage collected
    70     """
    71     cdef uint8_t *buf[3]
    72     cdef int freed
    73 
    74     cdef set_plane(self, int plane, uint8_t *buf):
    75         assert plane in (0, 1, 2)
    76         self.buf[plane] = buf
    77 
    78     def __dealloc__(self):
    79         #print("CSCImage.__dealloc__() calling free()")
    80         self.free()
    81 
    82     def free(self):
    83         #print("CSCImage.free() free_csc_image(..) already? %s" % self.freed)
    84         if self.freed==0:
    85             self.freed = 1
    86             free_csc_image(self.buf)
    87 
    88 
    89 class CSCImageWrapper(ImageWrapper):
    90 
    91     def free(self):                             #@DuplicatedSignature
    92         #print("YUVImageWrapper.free() csc_image=%s" % self.csc_image)
    93         if self.csc_image:
    94             self.csc_image.free()
    95             self.csc_image = None
    96 
    97 cdef class ColorspaceConverter:
    98     cdef int frames
    99     cdef csc_nvcuda_ctx *context
    100     cdef int src_width
    101     cdef int src_height
    102     cdef char* src_format
    103     cdef int dst_width
    104     cdef int dst_height
    105     cdef char* dst_format
    106     cdef double time
    107 
    108     def init_context(self, int src_width, int src_height, src_format,
    109                            int dst_width, int dst_height, dst_format, int speed=100):    #@DuplicatedSignature
    110         self.src_width = src_width
    111         self.src_height = src_height
    112         self.dst_width = dst_width
    113         self.dst_height = dst_height
    114         self.time = 0
    115         #ugly trick to use a string which won't go away from underneath us:
    116         assert src_format in COLORSPACES_SRC, "invalid source format: %s" % src_format
    117         for x in COLORSPACES_SRC:
    118             if x==src_format:
    119                 self.src_format = x
    120                 break
    121         assert dst_format in COLORSPACES_DST, "invalid destination format: %s" % dst_format
    122         for x in COLORSPACES_DST:
    123             if x==dst_format:
    124                 self.dst_format = x
    125                 break
    126         self.frames = 0
    127         self.context = init_csc(self.src_width, self.src_height, self.src_format, self.dst_format)
    128 
    129     def get_info(self):
    130         info = {"frames"    : self.frames,
    131                 "src_width" : self.src_width,
    132                 "src_height": self.src_height,
    133                 "src_format": self.src_format,
    134                 "dst_width" : self.dst_width,
    135                 "dst_height": self.dst_height,
    136                 "dst_format": self.dst_format}
    137         if self.frames>0 and self.time>0:
    138             pps = float(self.src_width) * float(self.src_height) * float(self.frames) / self.time
    139             info["total_time_ms"] = int(self.time*1000.0)
    140             info["pixels_per_second"] = int(pps)
    141         return info
    142 
    143     def __str__(self):
    144         return "nvcuda(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
    145                                                  self.dst_format, self.dst_width, self.dst_height)
    146 
    147     def __dealloc__(self):                  #@DuplicatedSignature
    148         self.clean()
    149 
    150     def get_src_width(self):
    151         return self.src_width
    152 
    153     def get_src_height(self):
    154         return self.src_height
    155 
    156     def get_src_format(self):
    157         return self.src_format
    158 
    159     def get_dst_width(self):
    160         return self.dst_width
    161 
    162     def get_dst_height(self):
    163         return self.dst_height
    164 
    165     def get_dst_format(self):
    166         return self.dst_format
    167 
    168     def get_type(self):
    169         return  "nvcuda"
    170 
    171 
    172     def clean(self):                        #@DuplicatedSignature
    173         if self.context!=NULL:
    174             free_csc(self.context)
    175             free(self.context)
    176             self.context = NULL
    177 
    178     def convert_image(self, image):
    179         cdef Py_ssize_t pic_buf_len = 0
    180         assert self.context!=NULL
    181         cdef const uint8_t *input_image[3]
    182         cdef uint8_t *output_image[3]
    183         cdef int input_stride[3]
    184         cdef int output_stride[3]
    185         cdef int planes
    186         cdef int i                          #@DuplicatedSignature
    187         cdef int height
    188         cdef int stride
    189         cdef int result
    190         planes = image.get_planes()
    191         assert planes in (0, 1, 3), "invalid number of planes: %s" % planes
    192         input = image.get_pixels()
    193         strides = image.get_rowstride()
    194         if planes==0:
    195             #magic: if planes==0, this is an XImageWrapper... with raw pixels/rowstride
    196             input = [input]
    197             strides = [strides]
    198             planes = 1
    199         #print("convert_image(%s) input=%s, strides=%s" % (image, len(input), strides))
    200         assert len(input)==planes, "expected %s planes but found %s" % (planes, len(input))
    201         assert len(strides)==planes, "expected %s rowstrides but found %s" % (planes, len(strides))
    202         for i in range(planes):
    203             input_stride[i] = strides[i]
    204             PyObject_AsReadBuffer(input[i], <const void**> &input_image[i], &pic_buf_len)
    205         start = time.time()
    206         with nogil:
    207             result = csc_image(self.context, input_image, input_stride, output_image, output_stride)
    208         if result != 0:
    209             return None
    210         end = time.time()
    211         self.time += (end-start)
    212         self.frames += 1
    213         #now parse the output:
    214         csci = CSCImage()           #keep a reference to memory for cleanup
    215         if self.dst_format.endswith("P"):
    216             nplanes = 3
    217             divs = get_subsampling_divs(self.dst_format)
    218             #print("convert_image(%s) nplanes=%s, divs=%s" % (image, nplanes, divs))
    219             out = []
    220             strides = []
    221             for i in range(nplanes):
    222                 _, dy = divs[i]
    223                 if dy==1:
    224                     height = self.dst_height
    225                 elif dy==2:
    226                     height = (self.dst_height+1)>>1
    227                 else:
    228                     raise Exception("invalid height divisor %s" % dy)
    229                 stride = output_stride[i]
    230                 if stride>0 and output_image[i]!=NULL:
    231                     plane = PyBuffer_FromMemory(<void *>output_image[i], height * stride)
    232                 else:
    233                     stride = 0
    234                     plane = None
    235                 csci.set_plane(i, output_image[i])
    236                 out.append(plane)
    237                 strides.append(stride)
    238         else:
    239             nplanes = 0
    240             strides = output_stride[0]
    241             out = PyBuffer_FromMemory(<void *>output_image[0], self.dst_height * strides)
    242             csci.set_plane(0, output_image[0])
    243         out_image = CSCImageWrapper(0, 0, self.dst_width, self.dst_height, out, self.dst_format, 24, strides, nplanes)
    244         out_image.csc_image = csci
    245         return out_image
  • xpra/codecs/csc_nvcuda/csc_nvcuda.c

     
    1 /* This file is part of Xpra.
    2  * Copyright (C) 2013 Arthur Huillet
    3  * Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    4  * later version. See the file COPYING for details.
    5  */
    6 
    7 #include <stdio.h>
    8 #include <stdlib.h>
    9 #include <string.h>
    10 #include <stdarg.h>
    11 
    12 #include <cuda.h>
    13 #include <cuda_runtime.h>
    14 #include <npp.h>
    15 
    16 #ifndef _WIN32
    17 #include <stdint.h>
    18 #include <unistd.h>
    19 #else
    20 #include "stdint.h"
    21 #include "inttypes.h"
    22 #endif
    23 
    24 #include "csc_nvcuda.h"
    25 
    26 static int cuda_device = -1;
    27 static int cuda_initialized = 0;
    28 static CUcontext *cuda_context;
    29 
    30 enum colorspace {
    31         UNKNOWN=-1,
    32         RGB = 0,
    33         RGBA,
    34         BGR,
    35         BGRA,
    36         YUV420P,
    37         YUV422P,
    38         YUV444P,
    39 };
    40 
    41 struct csc_nvcuda_ctx {
    42         int width;
    43         int height;
    44         enum colorspace src_colorspace;
    45         enum colorspace dst_colorspace;
    46 };
    47 
    48 static const struct {
    49         enum colorspace cspace;
    50         const char *name;
    51 } colorspaces[] = {
    52                 { RGB,     "RGB"     },
    53                 { RGBA,    "RGBA"    },
    54                 { BGR,     "BGR"     },
    55                 { BGRA,    "BGRX"    },
    56                 { YUV420P, "YUV420P" },
    57                 { YUV422P, "YUV422P" },
    58                 { YUV444P, "YUV444P" },
    59 };
    60 
    61 
    62 /* Representing the functions in a single table would be quite difficult.
    63    Instead, we use several tables to represent the Npp functions to be called.
    64    */
    65 typedef NppStatus (*packed_to_subsampled_planar_func) (const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int rDstStep[3], NppiSize oSizeROI);
    66 typedef NppStatus (*packed_to_planar_func) (const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int DstStep, NppiSize oSizeROI);
    67 
    68 static packed_to_planar_func NPP_dst_YUV444P[] = {
    69         [RGB] = nppiRGBToYCbCr_8u_C3P3R,
    70         [RGBA] = nppiRGBToYCbCr_8u_AC4P3R,
    71         [BGR] = NULL, // not present in NPP, need nppiSwapChannels first
    72         [BGRA] = NULL, // same as above
    73 };
    74 
    75 static packed_to_subsampled_planar_func NPP_dst_YUV422P[] = {
    76         [RGB] = nppiRGBToYCbCr422_8u_C3P3R,
    77         [RGBA] = NULL, //WTF?
    78         [BGR] = nppiBGRToYCbCr422_8u_C3P3R,
    79         [BGRA] = nppiBGRToYCbCr422_8u_AC4P3R,
    80 };
    81 
    82 static packed_to_subsampled_planar_func NPP_dst_YUV420P[] = {
    83         [RGB] =  nppiRGBToYCbCr420_8u_C3P3R,
    84         [RGBA] =  NULL, //WTF?
    85         [BGR] =  nppiBGRToYCbCr420_8u_C3P3R,
    86         [BGRA] =  nppiBGRToYCbCr420_8u_AC4P3R,
    87 };
    88 
    89 #define ARRAY_SIZE(X) (int)(sizeof(X)/sizeof(X[0]))
    90 static enum colorspace get_colorspace_by_name(const char *str)
    91 {
    92         int i;
    93         if (!str)
    94                 return UNKNOWN;
    95 
    96         for (i = 0; i < ARRAY_SIZE(colorspaces); i++) {
    97                 if (!strcmp(str, colorspaces[i].name))
    98                         return colorspaces[i].cspace;
    99         }
    100 
    101         fprintf(stderr, "Colorspace %s not supported.\n", str);
    102         return UNKNOWN;
    103 }
    104 
    105 /* Retrieve the conversion function for a un-subsampled planar destination.
    106  This cannot be unified because the NPP signatures are different from the other variants.*/
    107 packed_to_planar_func get_conversion_function_444(enum colorspace src, enum colorspace dst)
    108 {
    109         if (dst != YUV444P)
    110                 return NULL;
    111 
    112         if (src >= ARRAY_SIZE(NPP_dst_YUV444P)) {
    113                 fprintf(stderr, "Source colorspace %d not supported for YUV444P destination\n", src);
    114                 return NULL;
    115         }
    116 
    117         return NPP_dst_YUV444P[src];
    118 }
    119 
    120 /* Retrieve the conversion function for a subsampled planar destination. */
    121 packed_to_subsampled_planar_func get_conversion_function_subsampled(enum colorspace src, enum colorspace dst)
    122 {
    123 #define get_func(ARR) do { \
    124                 if (src >= ARRAY_SIZE(ARR)) \
    125                         return NULL; \
    126                 return ARR[src];        \
    127                 } while (0)
    128         if (dst == YUV420P) {
    129                 get_func(NPP_dst_YUV420P);
    130         } else if (dst == YUV422P) {
    131                 get_func(NPP_dst_YUV422P);
    132         } else {
    133                 fprintf(stderr, "Destination colorspace %d not supported as subsampled dest.\n", dst);
    134                 return NULL;
    135         }
    136 }
    137 
    138 static void _cuda_report_error(int line, const char *fmt, ...)
    139 {
    140         fprintf(stderr, "Cuda error in %s:%d: ", __FILE__, line);
    141         va_list ap;
    142         va_start(ap, fmt);
    143         vfprintf(stderr, fmt, ap);
    144         va_end(ap);
    145         fprintf(stderr, " - %s\n", cudaGetErrorString(cudaGetLastError()));
    146 }
    147 
    148 #define cuda_err(fmt, ...) _cuda_report_error(__LINE__, fmt, ##__VA_ARGS__)
    149 
    150 const char *get_NPP_version(void)
    151 {
    152 #define xstr(s) str(s)
    153 #define str(s) #s
    154         return "" xstr(NPP_VERSION_MAJOR) "." xstr(NPP_VERSION_MINOR) "." xstr(NPP_VERSION_BUILD);
    155 }
    156 
    157 int init_cuda(void)
    158 {
    159         int cuda_count = 0;
    160         char PCI_id[25];
    161         struct cudaDeviceProp prop;
    162 
    163         if (cudaGetDeviceCount(&cuda_count)) {
    164                 fprintf(stderr, "No CUDA devices available.\n");
    165         }
    166 
    167         int i;
    168         for (i = 0; i < cuda_count; i++) {
    169                 cudaSetDevice(i);
    170 
    171                 // Retrieve device properties
    172                 if (cudaGetDeviceProperties(&prop, i)) {
    173                         cuda_err("Error retrieving Cuda device %d properties, skipping", i);
    174                         continue;
    175                 }
    176 
    177                 // Check if device is able to map host memory
    178                 if (!prop.canMapHostMemory) {
    179                         cuda_err("Device %d cannot map host memory, skipping", i);
    180                         continue;
    181                 }
    182        
    183                 // Tell CUDA we want to map host memory
    184                 if(cudaSetDeviceFlags(cudaDeviceMapHost)) {
    185                         cuda_err("Unable to set cudaDeviceMapHost device flag");
    186                         return 1;
    187                 }
    188 
    189                 // All good - select this device!
    190                 break;
    191         }
    192 
    193         if (i == cuda_count) {
    194                 fprintf(stderr, "No suitable CUDA devices available.\n");
    195                 return 1;
    196         }
    197 
    198         // Select this device
    199         cuda_device = i;
    200 
    201         const NppLibraryVersion *lib_version = nppGetLibVersion();
    202 
    203         // Report status
    204                 // This call initializes the device for real, instead of it being done later when converting frames
    205         cudaDeviceGetPCIBusId(PCI_id, sizeof(PCI_id), cuda_device);
    206         printf("Using CUDA device %s at %s, NPP version %d.%d.%d\n", nppGetGpuName(), PCI_id, lib_version->major, lib_version->minor, lib_version->build);
    207 
    208         if (cuInit(0)) {
    209                 fprintf(stderr, "cuInit failed\n");
    210         }
    211 
    212         //printf("curren = %p\n", cuCtxGetCurrent(cuda_context));
    213         printf("Cuda context ptr %p\n", cuda_context);
    214         cuda_initialized = 1;
    215         return 0;
    216 }
    217 
    218 struct csc_nvcuda_ctx *init_csc(int width, int height, const char *src_format_str, const char *dst_format_str)
    219 {
    220         struct csc_nvcuda_ctx *ctx = malloc(sizeof(struct csc_nvcuda_ctx));
    221         if (!ctx)
    222                 return NULL;
    223        
    224         ctx->width = width;
    225         ctx->height = height;
    226         ctx->src_colorspace = get_colorspace_by_name(src_format_str);
    227         ctx->dst_colorspace = get_colorspace_by_name(dst_format_str);
    228 
    229         // Check if we have a conversion function for src->dst
    230         void *func;
    231         if (ctx->dst_colorspace == YUV444P) {
    232                 func = get_conversion_function_444(ctx->src_colorspace, ctx->dst_colorspace);
    233         } else {
    234                 func = get_conversion_function_subsampled(ctx->src_colorspace, ctx->dst_colorspace);
    235         }
    236         if (!func) {
    237                 fprintf(stderr, "Colorspace conversion with source %s and destination %s is not supported by csc_nvcuda.\n", src_format_str, dst_format_str);
    238                 goto err;
    239         }
    240 
    241         // Initialize Cuda (once in the application's lifetime)                 
    242         if (!cuda_initialized) {
    243                 fprintf(stderr, "Cuda was not initialized - please call init_cuda() before init_csc(). Initializing Cuda...\n");
    244                 if (init_cuda()) {
    245                         goto err;
    246                 }
    247         }
    248 
    249         return ctx;
    250 err:
    251         free(ctx);
    252         return NULL;
    253 }
    254 
    255 int csc_image(struct csc_nvcuda_ctx *ctx, const uint8_t *in[3], const int stride[3], uint8_t *out[3], int out_stride[3])
    256 {
    257         if (!ctx)
    258                 return 1;
    259 
    260         int pinned_input_buffer = 1;
    261         int pinned_output_buffer = 1;
    262         NppiSize size = { ctx->width, ctx->height };
    263         Npp8u *src = NULL; // GPU-side input buffer
    264         //uint8_t *dstbuf = NULL; // CPU-side linear output buffer (data + strides)
    265         uint8_t *gpudst[3] = { NULL, NULL, NULL }; // GPU-side planar output array
    266 
    267         // Plane dimensions
    268         int y_width = ctx->width;
    269         int uv_width = ctx->width;
    270         int uv_height = ctx->height;
    271 
    272         switch (ctx->dst_colorspace) {
    273                 case YUV420P:
    274                         uv_height /=  2;
    275                         /* fall through */
    276                 case YUV422P:
    277                         uv_width /= 2;
    278                         break;
    279                 case YUV444P:
    280                         ;
    281                 default:
    282                         fprintf(stderr, "%s: Unimplemented destination colorspace: %d\n", __FUNCTION__, ctx->dst_colorspace);
    283                         return 1;
    284         }
    285 
    286 
    287         // Pin CPU input buffer if possible
    288         if (cudaHostRegister((void *)in[0], stride[0]*ctx->height, cudaHostRegisterMapped)) {
    289                 pinned_input_buffer = 0;
    290         }
    291                
    292         // Allocate GPU input buffer
    293         if (cudaMalloc((void *)&src, stride[0]*ctx->height)) {
    294                 cuda_err("cudaMalloc input buf");
    295                 goto err0;
    296         }
    297 
    298         // Copy input data to GPU buffer
    299         if (pinned_input_buffer) {
    300                 // Use asynchronous copy if the buffer is pinned
    301                 if (cudaMemcpyAsync(src, in[0], stride[0]*ctx->height, cudaMemcpyHostToDevice, 0)) {
    302                         cuda_err("cudaMemcpyAsync input buf");
    303                         goto err1;
    304                 }
    305         } else {
    306                 if (cudaMemcpy(src, in[0], stride[0]*ctx->height, cudaMemcpyHostToDevice)) {
    307                         cuda_err("cudaMemcpy input buf");
    308                         goto err1;
    309                 }
    310         }
    311 
    312         cudaDeviceSynchronize();
    313        
    314 
    315         // Allocate GPU output buffer
    316         cudaMallocPitch((void *)&gpudst[0], (void *)&out_stride[0], y_width, ctx->height);
    317         cudaMallocPitch((void *)&gpudst[1], (void *)&out_stride[1], uv_width, uv_height);
    318         cudaMallocPitch((void *)&gpudst[2], (void *)&out_stride[2], uv_width, uv_height);
    319 
    320         // Allocate CPU output buffer
    321         out[0] = malloc(out_stride[0] * ctx->height + (out_stride[1] + out_stride[2]) * uv_height);
    322         out[1] = out[0] + out_stride[0] * ctx->height;
    323         out[2] = out[1] + out_stride[1] * uv_height;
    324         printf("instride %d\nCPU input:\t%p\n->GPU input:\t%p\noutstride %d\t%d\t%d\nCPU output:\t%p\t%p\t%p\n->GPU output:\t%p\t%p\t%p\n", stride[0], in[0], src, out_stride[0], out_stride[1], out_stride[2], out[0], out[1], out[2], gpudst[0], gpudst[1], gpudst[2]);
    325        
    326         // Pin output buffer if possible
    327         if (cudaHostRegister((void *)out[0], (out_stride[0] + out_stride[1] + out_stride[2]) * ctx->height, cudaHostRegisterMapped)) {
    328                 pinned_output_buffer = 0;
    329         }
    330 
    331         packed_to_subsampled_planar_func func = NULL;
    332         packed_to_planar_func func2 = NULL;
    333         int err = 0;
    334 
    335         if (ctx->dst_colorspace == YUV444P) {
    336                 func2 = get_conversion_function_444(ctx->src_colorspace, ctx->dst_colorspace);
    337                 if (func2)
    338                         err = func2(src, stride[0], gpudst, out_stride[0], size);
    339                 else goto err2;
    340         } else {
    341                 func = get_conversion_function_subsampled(ctx->src_colorspace, ctx->dst_colorspace);
    342                 if (func)
    343                         err = func(src, stride[0], gpudst, out_stride, size);
    344                 else goto err2;
    345         }
    346 
    347         cudaDeviceSynchronize();
    348         if (err) {
    349                 const char *str = NULL;
    350                 switch (err) {
    351                         case -4: str = "NPP_NULL_POINTER_ERROR"; break;
    352                         case -7: str = "NPP_STEP_ERROR"; break;
    353                         case -8: str = "NPP_ALIGNMENT_ERROR"; break;
    354                         case -19: str = "NPP_NOT_EVEN_STEP_ERROR"; break;
    355                         default:
    356                                           str = "(unknown)";
    357                 }
    358                 fprintf(stderr, "nppiRGBToYCbCr420_8u_C3P3R failed: %d - %s\n", err, str);
    359                 goto err2;
    360         }
    361 
    362         if (pinned_output_buffer) {
    363                 if (cudaMemcpyAsync(out[0], gpudst[0], out_stride[0] * ctx->height, cudaMemcpyDeviceToHost, 0) ||
    364                         cudaMemcpyAsync(out[1], gpudst[1], out_stride[1] * uv_height, cudaMemcpyDeviceToHost, 0) ||
    365                         cudaMemcpyAsync(out[2], gpudst[2], out_stride[2] * uv_height, cudaMemcpyDeviceToHost, 0)) {
    366                         cuda_err("cudaMemcpyAsync output buf");
    367                         goto err2;
    368                 }
    369         } else {
    370                 if (cudaMemcpy(out[0], gpudst[0], out_stride[0] * ctx->height, cudaMemcpyDeviceToHost) ||
    371                         cudaMemcpy(out[1], gpudst[1], out_stride[1] * uv_height, cudaMemcpyDeviceToHost) ||
    372                         cudaMemcpy(out[2], gpudst[2], out_stride[2] * uv_height, cudaMemcpyDeviceToHost)) {
    373                         cuda_err("cudaMemcpy output buf");
    374                         goto err2;
    375                 }
    376         }
    377         cudaDeviceSynchronize();
    378 
    379         // Free GPU output buffer
    380         cudaFree(gpudst[0]);
    381         cudaFree(gpudst[1]);
    382         cudaFree(gpudst[2]);
    383         // Free GPU input buffer
    384         cudaFree(src);
    385 
    386         // Un-pin CPU buffers
    387         if (pinned_input_buffer) {
    388                 cudaHostUnregister((void *)in);
    389         }
    390         if (pinned_output_buffer) {
    391                 cudaHostUnregister((void *)out[0]);
    392         }
    393 
    394         return 0;
    395 
    396 err2:
    397         if (pinned_output_buffer)
    398                 cudaHostUnregister((void *)out[0]);
    399         cudaFree(gpudst[0]);
    400         cudaFree(gpudst[1]);
    401         cudaFree(gpudst[2]);
    402         free(out[0]);
    403 err1:
    404         cudaFree(src);
    405 err0:
    406         if (pinned_input_buffer)
    407                 cudaHostUnregister((void *)in);
    408         return 1;
    409 }
    410 
    411 int free_csc_image(uint8_t *buf[3])
    412 {
    413         free(buf[0]);
    414         return 0;
    415 }
    416 
    417 void free_csc(struct csc_nvcuda_ctx *ctx)
    418 {
    419         return;
    420 }
    421 
    422 const char *get_flags_description(struct csc_nvcuda_ctx *ctx) {
    423         return "";
    424 }
  • xpra/codecs/csc_nvcuda/csc_nvcuda.h

     
    1 /* This file is part of Xpra.
    2  * Copyright (C) 2013 Arthur Huillet
    3  * Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    4  * later version. See the file COPYING for details.
    5  */
    6 
    7 #ifdef _WIN32
    8 #include "stdint.h"
    9 #include "inttypes.h"
    10 #else
    11 #include "stdint.h"
    12 #endif
    13 
    14 #ifdef _WIN32
    15 #define _STDINT_H
    16 #endif
    17 
    18 /** Opaque structure - "context". You must have a context to convert frames */
    19 struct csc_nvcuda_ctx;
    20 
    21 /**
    22  * Initialize Cuda. Call this before doing anything else.
    23  * Will be called by init_csc if you forget.
    24  * @return 0 if OK, non zero on error
    25  */
    26 int init_cuda(void);
    27 
    28 /**
    29  * Return the version of nvidia performance primitives,
    30  * the lib used by csc_nvcuda for CSC.
    31  */
    32 const char *get_NPP_version(void);
    33 
    34 /** Create a CSC context
    35  * @return NULL on error
    36  */
    37 struct csc_nvcuda_ctx *init_csc(int width, int height, const char *src_format_str, const char *dst_format_str);
    38 
    39 /** Free a CSC context */
    40 void free_csc(struct csc_nvcuda_ctx *ctx);
    41 
    42 /** Colorspace conversion.
    43  * Note: you must call free_csc_image() to free the image buffer.
    44  @param in: Input buffer planes.
    45  @param stride: Input strides.
    46  @param out: Array of pointers to be set to point to data planes.
    47  @param out_stride: Array of strides
    48  @return: 0 if OK, 1 on error
    49 */
    50 int csc_image(struct csc_nvcuda_ctx *ctx, const uint8_t *in[3], const int in_stride[3], uint8_t *out[3], int out_stride[3]);
    51 
    52 /**
    53  * Free the output of RGB 2 YUV conversion. You have to pass the pointer to the Y plane. This function will
    54  * free all planes at once.
    55  */
    56 int free_csc_image(uint8_t *buf[3]);
    57 
  • setup.py

     
    477477                   "xpra/codecs/dec_avcodec/constants.pxi",
    478478                   "xpra/codecs/csc_swscale/colorspace_converter.c",
    479479                   "xpra/codecs/csc_swscale/constants.pxi",
    480                    "xpra/codecs/csc_nvcuda/colorspace_converter.c",
    481480                   "xpra/codecs/xor/cyxor.c",
    482481                   "xpra/codecs/argb/argb.c",
    483482                   "xpra/server/stats/cymaths.c",
     
    957956
    958957toggle_packages(nvenc_ENABLED, "xpra.codecs.nvenc")
    959958toggle_packages(csc_opencl_ENABLED, "xpra.codecs.csc_opencl")
     959toggle_packages(csc_nvcuda_ENABLED, "xpra.codecs.csc_nvcuda")
    960960
    961961toggle_packages(enc_x264_ENABLED, "xpra.codecs.enc_x264")
    962962if enc_x264_ENABLED:
     
    997997                ["xpra/codecs/csc_swscale/colorspace_converter.pyx", "xpra/codecs/csc_swscale/csc_swscale.c", "xpra/codecs/memalign/memalign.c"],
    998998                **swscale_pkgconfig), min_version=(0, 19))
    999999
    1000 toggle_packages(csc_nvcuda_ENABLED, "xpra.codecs.csc_nvcuda")
    1001 if csc_nvcuda_ENABLED:
    1002     cuda_pkgconfig = pkgconfig("cuda")
    1003     cython_add(Extension("xpra.codecs.csc_nvcuda.colorspace_converter",
    1004                 ["xpra/codecs/csc_nvcuda/colorspace_converter.pyx", "xpra/codecs/csc_nvcuda/csc_nvcuda.c"],
    1005                 **cuda_pkgconfig), min_version=(0, 16))
    1006 
    10071000toggle_packages(vpx_ENABLED, "xpra.codecs.vpx")
    10081001if vpx_ENABLED:
    10091002    if vpx_static_ENABLED: