xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #384: csc_nvcuda-v8.patch

File csc_nvcuda-v8.patch, 50.8 KB (added by Antoine Martin, 8 years ago)

works both ways now

  • xpra/codecs/csc_nvcuda/colorspace_converter.py

     
    11# This file is part of Xpra.
    2 # Copyright (C) 2013 Arthur Huillet
    32# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
    43# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    54# later version. See the file COPYING for details.
    65
    7 import time
     6
     7from xpra.codecs.image_wrapper import ImageWrapper
    88from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
    9 from xpra.codecs.image_wrapper import ImageWrapper
     9from xpra.log import Logger, debug_if_env
     10log = Logger()
     11debug = debug_if_env(log, "XPRA_CUDA_DEBUG")
     12error = log.error
    1013
    11 cdef extern from "stdlib.h":
    12     void free(void *ptr)
     14import numpy
     15import time
     16import ctypes
     17import sys
     18assert bytearray
     19import pycuda               #@UnresolvedImport
     20from pycuda import driver   #@UnresolvedImport
     21driver.init()
    1322
    14 cdef extern from "Python.h":
    15     ctypedef int Py_ssize_t
    16     ctypedef object PyObject
    17     object PyBuffer_FromMemory(void *ptr, Py_ssize_t size)
    18     int PyObject_AsReadBuffer(object obj, void ** buffer, Py_ssize_t * buffer_len) except -1
     23log.info("PyCUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
     24log.info("PyCUDA driver version=%s", driver.get_driver_version())
    1925
    20 ctypedef unsigned char uint8_t
    21 ctypedef void csc_nvcuda_ctx
    22 cdef extern from "csc_nvcuda.h":
    23 #char **get_supported_colorspaces()
     26ngpus = driver.Device.count()
     27log.info("PyCUDA found %s devices:", ngpus)
     28selected_device = None
     29for i in range(ngpus):
     30    d = driver.Device(i)
     31    attr = d.get_attributes()
     32    debug("compute_capability=%s, attributes=%s", d.compute_capability(), attr)
     33    host_mem = d.get_attribute(driver.device_attribute.CAN_MAP_HOST_MEMORY)
     34    debug("CAN_MAP_HOST_MEMORY=%s", host_mem)
     35    pre = "-"
     36    if host_mem:
     37        pre = "+"
     38    log.info(" %s %s @ %s (%sMB)", pre, d.name(), d.pci_bus_id(), int(d.total_memory()/1024/1024))
     39    if host_mem and selected_device is None:
     40        selected_device = d
     41assert selected_device is not None
     42context = selected_device.make_context(flags=driver.ctx_flags.SCHED_YIELD | driver.ctx_flags.MAP_HOST)
     43debug("testing with context=%s", context)
     44debug("api version=%s", context.get_api_version())
     45free, total = driver.mem_get_info()
     46debug("using device %s, memory: free=%sMB, total=%sMB",  selected_device, int(free/1024/1024), int(total/1024/1024))
     47context.pop()
     48#ensure we cleanup:
     49class CudaContextWrapper(object):
    2450
    25     int init_cuda()
    26     csc_nvcuda_ctx *init_csc(int src_width, int src_height, const char *src_format, const char *dst_format)
    27     void free_csc(csc_nvcuda_ctx *ctx)
    28     int csc_image(csc_nvcuda_ctx *ctx, const uint8_t *input_image[3], const int in_stride[3], uint8_t *out[3], int out_stride[3]) nogil
    29     void free_csc_image(uint8_t *buf[3])
    30     char *get_NPP_version()
     51    def __init__(self, context):
     52        self.context = context
    3153
     54    def __del__(self):
     55        self.context.detach()
     56        self.context = None
     57ccw = CudaContextWrapper(context)
    3258
    33 _init = None
    34 def init():
    35     global _init
    36     if _init is None:
    37         _init = init_cuda()==0
    38     return _init
    39 init()
    4059
    41 COLORSPACES_SRC = [ "RGB", "RGBA", "BGR", "BGRX" ]
    42 COLORSPACES_DST = [ "YUV420P", "YUV422P", "YUV444P" ]
     60def find_lib(basename):
     61    try:
     62        if sys.platform == "win32":
     63            libname = basename+".dll"
     64        else:
     65            libname = basename+".so"
     66        return ctypes.cdll.LoadLibrary(libname)
     67    except Exception, e:
     68        debug("could not find %s: %s", basename, e)
     69        return None
     70   
     71_NPP_LIBRARY_NAMES = ["libnppi",    #CUDA5.5
     72                      "libnpp"]     #CUDA5.0
     73_NPP_LIBRARIES = []
     74for name in _NPP_LIBRARY_NAMES:
     75    lib = find_lib(name)
     76    if lib:
     77        _NPP_LIBRARIES.append(lib)
     78if len(_NPP_LIBRARIES)==0:
     79    raise ImportError("failed to load npp library - check your library path")
    4380
     81
     82class NppiSize(ctypes.Structure):
     83    _fields_ = [("width", ctypes.c_int),
     84                ("height", ctypes.c_int)]
     85
     86def Npp8u_p(buf):
     87    return ctypes.cast(int(buf), ctypes.c_void_p)
     88
     89RGB_to_YUV444P_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]
     90RGB_to_YUV42xP_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]
     91
     92YUV444P_to_RGB_argtypes = [(ctypes.c_void_p)*3, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, NppiSize]
     93YUV42xP_to_RGB_argtypes = [(ctypes.c_void_p)*3, (ctypes.c_int)*3, ctypes.c_void_p, ctypes.c_int, NppiSize]
     94CONSTANT_ALPHA = ctypes.c_uint8
     95
     96
     97COLORSPACES_MAP_STR  = {
     98        ("RGBX",    "YUV444P")  : ("nppiRGBToYCbCr_8u_C3P3R",       RGB_to_YUV444P_argtypes),
     99        ("RGBA",    "YUV444P")  : ("nppiRGBToYCbCr_8u_AC4P3R",      RGB_to_YUV444P_argtypes),
     100        ("YUV444P", "RGB")      : ("nppiYCbCrToRGB_8u_P3C3R",       YUV444P_to_RGB_argtypes),
     101        ("YUV444P", "BGR")      : ("nppiYCbCrToBGR_8u_P3C3R",       YUV444P_to_RGB_argtypes),
     102        ("YUV444P", "RGBX")     : ("nppiYCbCrToRGB_8u_P3C4R",       YUV444P_to_RGB_argtypes+[CONSTANT_ALPHA]),
     103        ("YUV444P", "BGRX")     : ("nppiYCbCrToBGR_8u_P3C4R",       YUV444P_to_RGB_argtypes+[CONSTANT_ALPHA]),
     104        #BGR / BGRA: need nppiSwap(Channels before one of the above
     105        ("RGBX",    "YUV422P")  : ("nppiRGBToYCbCr422_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     106        ("BGRX",    "YUV422P")  : ("nppiBGRToYCbCr422_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     107        ("BGRA",    "YUV422P")  : ("nppiBGRToYCbCr422_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
     108        ("YUV422P", "RGB")      : ("nppiYCbCr422ToRGB_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
     109        ("YUV422P", "BGR")      : ("nppiYCbCr422ToBGR_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
     110        #YUV420P:
     111        ("RGBX",    "YUV420P")  : ("nppiRGBToYCbCr420_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     112        ("BGRX",    "YUV420P")  : ("nppiBGRToYCbCr420_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
     113        ("RGBA",    "YUV420P")  : ("nppiRGBToYCrCb420_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
     114        ("BGRA",    "YUV420P")  : ("nppiBGRToYCbCr420_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
     115        ("YUV420P", "RGB")      : ("nppiYCbCr420ToRGB_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
     116        ("YUV420P", "BGR")      : ("nppiYCbCr420ToBGR_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
     117        ("YUV420P", "RGBX")     : ("nppiYCrCb420ToRGB_8u_P3C4R",    YUV42xP_to_RGB_argtypes),
     118        ("YUV420P", "BGRX")     : ("nppiYCbCr420ToBGR_8u_P3C4R",    YUV42xP_to_RGB_argtypes),
     119        }
     120#ie:
     121#BGR to YUV420P:
     122#NppStatus nppiBGRToYCbCr420_8u_C3P3R (const Npp8u *pSrc, int nSrcStep, Npp8u *pDst[3], int rDstStep[3], NppiSize oSizeROI)
     123#pSrc Source-Image Pointer.
     124#nSrcStep Source-Image Line Step.
     125#pDst Destination-Planar-Image Pointer Array.
     126#rDstStep Destination-Planar-Image Line Step Array.
     127#oSizeROI Region-of-Interest (ROI). (struct with width and height)
     128#Returns:
     129#Image Data Related Error Codes, ROI Related Error Codes
     130
     131#For YUV444P:
     132#NppStatus nppiRGBToYCbCr_8u_C3P3R(const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int nDstStep, NppiSize oSizeROI);
     133#(only one nDstStep!
     134
     135#YUV420P to RGB:
     136#NppStatus nppiYCbCrToRGB_8u_P3C3R(const Npp8u * const pSrc[3], int nSrcStep, Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
     137#YUV444P to RGB:
     138#NppStatus nppiYCrCb420ToRGB_8u_P3C4R(const Npp8u * const pSrc[3],int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
     139#Those with alpha add:
     140#Npp8u nAval
     141
     142
     143NPP_NO_OPERATION_WARNING = 1
     144NPP_DIVIDE_BY_ZERO_WARNING = 6
     145NPP_AFFINE_QUAD_INCORRECT_WARNING = 28
     146NPP_WRONG_INTERSECTION_ROI_WARNING = 29
     147NPP_WRONG_INTERSECTION_QUAD_WARNING = 30
     148NPP_DOUBLE_SIZE_WARNING = 35
     149NPP_MISALIGNED_DST_ROI_WARNING = 10000
     150
     151WARNINGS = {
     152    NPP_NO_OPERATION_WARNING :  "Indicates that no operation was performed",
     153    NPP_DIVIDE_BY_ZERO_WARNING: "Divisor is zero however does not terminate the execution",
     154    NPP_AFFINE_QUAD_INCORRECT_WARNING:  "Indicates that the quadrangle passed to one of affine warping functions doesn't have necessary properties. First 3 vertices are used, the fourth vertex discarded",
     155    NPP_WRONG_INTERSECTION_ROI_WARNING: "The given ROI has no interestion with either the source or destination ROI. Thus no operation was performed",
     156    NPP_WRONG_INTERSECTION_QUAD_WARNING:"The given quadrangle has no intersection with either the source or destination ROI. Thus no operation was performed",
     157    NPP_DOUBLE_SIZE_WARNING:    "Image size isn't multiple of two. Indicates that in case of 422/411/420 sampling the ROI width/height was modified for proper processing",
     158    NPP_MISALIGNED_DST_ROI_WARNING: "Speed reduction due to uncoalesced memory accesses warning"
     159    }
     160NPP_STEP_ERROR = -14
     161NPP_NOT_EVEN_STEP_ERROR = -108
     162
     163ERRORS = {
     164    NPP_STEP_ERROR : "Step is less or equal zero",
     165    NPP_NOT_EVEN_STEP_ERROR :   "Step value is not pixel multiple",
     166          }
     167
     168
     169YUV_INDEX_TO_PLANE = {
     170                      0 : "Y",
     171                      1 : "U",
     172                      2 : "V"
     173                      }
     174
     175
     176def roundup(n, m):
     177    return (n + m - 1) & ~(m - 1)
     178
     179
     180COLORSPACES_MAP = {}
     181for k, f_def in COLORSPACES_MAP_STR.items():
     182    fn, argtypes = f_def
     183    try:
     184        for lib in _NPP_LIBRARIES:
     185            if hasattr(lib, fn):
     186                cfn = getattr(lib, fn)
     187                debug("found %s for %s in %s: %s", fn, k, lib, cfn)
     188                COLORSPACES_MAP[k] = (fn, cfn)
     189                #set argument types and return type:
     190                cfn.restype = ctypes.c_int
     191                cfn.argtypes = argtypes
     192    except:
     193        log.error("could not load '%s', conversion disabled: %s", fn, k)
     194
     195
     196def get_type():
     197    return "nvcuda"
     198
    44199def get_version():
    45     return get_NPP_version()
     200    return pycuda.VERSION_TEXT
    46201
    47202def get_input_colorspaces():
    48     if not init():
    49         return []
    50     return COLORSPACES_SRC
     203    return sorted(set([src for src, _ in COLORSPACES_MAP.keys()]))
    51204
    52205def get_output_colorspaces(input_colorspace):
    53     if not init():
    54         return []
    55     #exclude input colorspace:
    56     return COLORSPACES_DST
     206    return sorted(set(dst for src,dst in COLORSPACES_MAP.keys() if src==input_colorspace))
    57207
     208def validate_in_out(in_colorspace, out_colorspace):
     209    assert in_colorspace in get_input_colorspaces(), "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, get_input_colorspaces())
     210    assert out_colorspace in get_output_colorspaces(in_colorspace), "invalid output colorspace: %s (must be one of %s for input %s)" % (out_colorspace, get_output_colorspaces(in_colorspace), in_colorspace)
     211
    58212def get_spec(in_colorspace, out_colorspace):
    59     assert init(), "nvcuda is not available!"
    60     assert in_colorspace in COLORSPACES_SRC, "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, COLORSPACES_SRC)
    61     assert out_colorspace in COLORSPACES_DST, "invalid output colorspace: %s (must be one of %s)" % (out_colorspace, COLORSPACES_DST)
     213    validate_in_out(in_colorspace, out_colorspace)
    62214    #ratings: quality, speed, setup cost, cpu cost, gpu cost, latency, max_w, max_h, max_pixels
    63215    return codec_spec(ColorspaceConverter, speed=100, setup_cost=10, cpu_cost=10, gpu_cost=50, min_w=16, min_h=16, can_scale=False)
    64216
    65217
    66 cdef class CSCImage:
    67     """
    68         Allows us to call free_csc_image
    69         when this object is garbage collected
    70     """
    71     cdef uint8_t *buf[3]
    72     cdef int freed
     218class ColorspaceConverter(object):
    73219
    74     cdef set_plane(self, int plane, uint8_t *buf):
    75         assert plane in (0, 1, 2)
    76         self.buf[plane] = buf
     220    def __init__(self):
     221        self.src_width = 0
     222        self.src_height = 0
     223        self.src_format = ""
     224        self.dst_width = 0
     225        self.dst_height = 0
     226        self.dst_format = ""
     227        self.time = 0
     228        self.frames = 0
     229        self.kernel_function = None
    77230
    78     def __dealloc__(self):
    79         #print("CSCImage.__dealloc__() calling free()")
    80         self.free()
    81 
    82     def free(self):
    83         #print("CSCImage.free() free_csc_image(..) already? %s" % self.freed)
    84         if self.freed==0:
    85             self.freed = 1
    86             free_csc_image(self.buf)
    87 
    88 
    89 class CSCImageWrapper(ImageWrapper):
    90 
    91     def free(self):                             #@DuplicatedSignature
    92         #print("YUVImageWrapper.free() csc_image=%s" % self.csc_image)
    93         if self.csc_image:
    94             self.csc_image.free()
    95             self.csc_image = None
    96 
    97 cdef class ColorspaceConverter:
    98     cdef int frames
    99     cdef csc_nvcuda_ctx *context
    100     cdef int src_width
    101     cdef int src_height
    102     cdef char* src_format
    103     cdef int dst_width
    104     cdef int dst_height
    105     cdef char* dst_format
    106     cdef double time
    107 
    108     def init_context(self, int src_width, int src_height, src_format,
    109                            int dst_width, int dst_height, dst_format, int speed=100):    #@DuplicatedSignature
     231    def init_context(self, src_width, src_height, src_format,
     232                           dst_width, dst_height, dst_format):    #@DuplicatedSignature
     233        validate_in_out(src_format, dst_format)
    110234        self.src_width = src_width
    111235        self.src_height = src_height
     236        self.src_format = src_format
    112237        self.dst_width = dst_width
    113238        self.dst_height = dst_height
    114         self.time = 0
    115         #ugly trick to use a string which won't go away from underneath us:
    116         assert src_format in COLORSPACES_SRC, "invalid source format: %s" % src_format
    117         for x in COLORSPACES_SRC:
    118             if x==src_format:
    119                 self.src_format = x
    120                 break
    121         assert dst_format in COLORSPACES_DST, "invalid destination format: %s" % dst_format
    122         for x in COLORSPACES_DST:
    123             if x==dst_format:
    124                 self.dst_format = x
    125                 break
    126         self.frames = 0
    127         self.context = init_csc(self.src_width, self.src_height, self.src_format, self.dst_format)
     239        self.dst_format = dst_format
     240        context.push()
     241        context.synchronize()
     242        k = (src_format, dst_format)
     243        npp_fn = COLORSPACES_MAP.get(k)
     244        assert npp_fn is not None, "invalid pair: %s" % k
     245        self.kernel_function_name, cfn = npp_fn
     246        debug("init_context%s npp conversion function=%s (%s)", (src_width, src_height, src_format, dst_width, dst_height, dst_format), self.kernel_function_name, cfn)
     247        self.kernel_function = cfn
     248        if src_format.find("YUV")>=0:
     249            self.convert_image = self.convert_image_yuv
     250        else:
     251            self.convert_image = self.convert_image_rgb
     252        debug("init_context(..) convert_image=%s", self.convert_image)
    128253
    129254    def get_info(self):
    130255        info = {"frames"    : self.frames,
     
    141266        return info
    142267
    143268    def __str__(self):
    144         return "nvcuda(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
     269        if self.queue is None:
     270            return "opencl(uninitialized)"
     271        return "opencl(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
    145272                                                 self.dst_format, self.dst_width, self.dst_height)
    146273
    147     def __dealloc__(self):                  #@DuplicatedSignature
     274    def is_closed(self):
     275        return False
     276
     277    def __del__(self):                  #@DuplicatedSignature
    148278        self.clean()
    149279
    150280    def get_src_width(self):
     
    166296        return self.dst_format
    167297
    168298    def get_type(self):
    169         return  "nvcuda"
     299        return  "opencl"
    170300
    171301
    172302    def clean(self):                        #@DuplicatedSignature
    173         if self.context!=NULL:
    174             free_csc(self.context)
    175             free(self.context)
    176             self.context = NULL
     303        if context:
     304            context.pop()
    177305
    178306    def convert_image(self, image):
    179         cdef Py_ssize_t pic_buf_len = 0
    180         assert self.context!=NULL
    181         cdef const uint8_t *input_image[3]
    182         cdef uint8_t *output_image[3]
    183         cdef int input_stride[3]
    184         cdef int output_stride[3]
    185         cdef int planes
    186         cdef int i                          #@DuplicatedSignature
    187         cdef int height
    188         cdef int stride
    189         cdef int result
    190         planes = image.get_planes()
    191         assert planes in (0, 1, 3), "invalid number of planes: %s" % planes
    192         input = image.get_pixels()
     307        #we override this method during init_context
     308        raise Exception("not initialized!")
     309
     310
     311    def convert_image_yuv(self, image):
     312        global program
     313        iplanes = image.get_planes()
     314        width = image.get_width()
     315        height = image.get_height()
    193316        strides = image.get_rowstride()
    194         if planes==0:
    195             #magic: if planes==0, this is an XImageWrapper... with raw pixels/rowstride
    196             input = [input]
    197             strides = [strides]
    198             planes = 1
    199         #print("convert_image(%s) input=%s, strides=%s" % (image, len(input), strides))
    200         assert len(input)==planes, "expected %s planes but found %s" % (planes, len(input))
    201         assert len(strides)==planes, "expected %s rowstrides but found %s" % (planes, len(strides))
    202         for i in range(planes):
    203             input_stride[i] = strides[i]
    204             PyObject_AsReadBuffer(input[i], <const void**> &input_image[i], &pic_buf_len)
    205         start = time.time()
    206         with nogil:
    207             result = csc_image(self.context, input_image, input_stride, output_image, output_stride)
    208         if result != 0:
     317        pixels = image.get_pixels()
     318        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
     319        assert iplanes==ImageWrapper._3_PLANES, "must use planar YUV as input"
     320        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
     321        assert len(strides)==len(pixels)==3, "invalid number of planes (%s) or strides (%s), should be 3" % (len(strides), len(pixels))
     322
     323        #YUV444P argtypes = [(ctypes.c_void_p)*3, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, NppiSize]
     324        #YUV42xP argtypes = [(ctypes.c_void_p)*3, (ctypes.c_int)*3, ctypes.c_void_p, ctypes.c_int, NppiSize]
     325        in_t = self.kernel_function.argtypes[0]            #always: (ctypes.c_void_p)*3
     326        in_strides_t = self.kernel_function.argtypes[1]    #(ctypes.c_int)*3 OR ctypes.c_int
     327
     328        divs = get_subsampling_divs(self.src_format)
     329
     330        #copy each plane to GPU:
     331        upload_start = time.time()
     332        in_bufs = []
     333        in_strides = strides
     334        in_sizes = []
     335        for i in range(3):
     336            x_div, y_div = divs[i]
     337            in_stride = strides[i]
     338            assert in_stride >= width/x_div, \
     339                "invalid stride %s is smaller than plane %s width %s/%s" % (in_stride, YUV_INDEX_TO_PLANE.get(i, i), width, x_div)
     340            in_height = height/y_div
     341            plane = pixels[i]
     342            in_size = in_stride*in_height
     343            in_sizes.append(in_size)
     344            assert len(plane)>=in_size
     345            in_buf = driver.to_device(plane)
     346            in_bufs.append(in_buf)
     347        upload_end = time.time()
     348        debug("%s pixels now on GPU at %s, took %.1fms", in_sizes, in_bufs, upload_end-upload_start)
     349        if in_strides_t==ctypes.c_int:
     350            #one stride for all planes (this must be YUV444P)
     351            assert len(set(in_strides))==1, "expected only one stride: %s" % str(in_strides)
     352            in_strides = [in_strides[0]]
     353
     354        #allocate output rgb buffer on CPU:
     355        out_stride = width*4
     356        out_buf, out_stride = driver.mem_alloc_pitch(out_stride, height, 4)
     357        src = in_t(*[Npp8u_p(in_buf) for in_buf in in_bufs])
     358        debug("in_strides=%s, out_stride=%s", in_strides, out_stride)
     359        #NppStatus nppiYCrCb420ToRGB_8u_P3C4R(const Npp8u * const pSrc[3],int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
     360        args = [src, in_strides_t(*in_strides), Npp8u_p(out_buf), ctypes.c_int(out_stride), NppiSize(width, height)]
     361        debug("last arg: %s", self.kernel_function.argtypes[-1])
     362        if self.kernel_function.argtypes[-1]==CONSTANT_ALPHA:
     363            #add hardcoded constant alpha:
     364            args.append(ctypes.c_uint8(255))
     365        debug("calling %s%s", self.kernel_function_name, tuple(args))
     366        kstart = time.time()
     367        v = self.kernel_function(*args)
     368        if v<0:
     369            log.error("%s%s returned an error: %s", self.kernel_function_name, args, ERRORS.get(v, v))
    209370            return None
    210         end = time.time()
    211         self.time += (end-start)
     371        elif v>0:
     372            #positive return-codes indicate warnings:
     373            warning = WARNINGS.get(v, "unknown")
     374            log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)
     375        kend = time.time()
     376        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
     377
     378        for in_buf in in_bufs:
     379            in_buf.free()
    212380        self.frames += 1
    213         #now parse the output:
    214         csci = CSCImage()           #keep a reference to memory for cleanup
    215         if self.dst_format.endswith("P"):
    216             nplanes = 3
    217             divs = get_subsampling_divs(self.dst_format)
    218             #print("convert_image(%s) nplanes=%s, divs=%s" % (image, nplanes, divs))
    219             out = []
    220             strides = []
    221             for i in range(nplanes):
    222                 _, dy = divs[i]
    223                 if dy==1:
    224                     height = self.dst_height
    225                 elif dy==2:
    226                     height = (self.dst_height+1)>>1
    227                 else:
    228                     raise Exception("invalid height divisor %s" % dy)
    229                 stride = output_stride[i]
    230                 if stride>0 and output_image[i]!=NULL:
    231                     plane = PyBuffer_FromMemory(<void *>output_image[i], height * stride)
    232                 else:
    233                     stride = 0
    234                     plane = None
    235                 csci.set_plane(i, output_image[i])
    236                 out.append(plane)
    237                 strides.append(stride)
    238         else:
    239             nplanes = 0
    240             strides = output_stride[0]
    241             out = PyBuffer_FromMemory(<void *>output_image[0], self.dst_height * strides)
    242             csci.set_plane(0, output_image[0])
    243         out_image = CSCImageWrapper(0, 0, self.dst_width, self.dst_height, out, self.dst_format, 24, strides, nplanes)
    244         out_image.csc_image = csci
    245         return out_image
     381        read_start = time.time()
     382        pixels = driver.aligned_empty(out_stride*height, dtype=numpy.byte)
     383        driver.memcpy_dtoh(pixels, out_buf)
     384        out_buf.free()
     385        context.synchronize()
     386        read_end = time.time()
     387        debug("read back took %.1fms", (read_end-read_start)*1000.0)
     388        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels.data, self.dst_format, 24, out_stride, planes=ImageWrapper.PACKED_RGB)
     389
     390
     391    def convert_image_rgb(self, image):
     392        global program
     393        iplanes = image.get_planes()
     394        width = image.get_width()
     395        height = image.get_height()
     396        stride = image.get_rowstride()
     397        size = image.get_size()
     398        pixels = image.get_pixels()
     399        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
     400        assert iplanes==ImageWrapper.PACKED_RGB, "must use packed rgb as input"
     401        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
     402
     403        divs = get_subsampling_divs(self.dst_format)
     404
     405        #copy packed rgb pixels to GPU:
     406        upload_start = time.time()
     407        gpu_image = driver.to_device(pixels)
     408        upload_end = time.time()
     409        debug("%s pixels now on GPU at %s, took %.1fms", size, gpu_image, upload_end-upload_start)
     410
     411        #YUV444P argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]
     412        #YUV42xP argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]
     413        out_t = self.kernel_function.argtypes[2]            #always: (ctypes.c_void_p)*3
     414        out_strides_t = self.kernel_function.argtypes[3]    #(ctypes.c_int)*3 OR ctypes.c_int
     415        out_bufs = []
     416        out_strides = []
     417        out_sizes = []
     418        for i in range(3):
     419            x_div, y_div = divs[i]
     420            out_stride = roundup(width/x_div, 4)
     421            out_height = roundup(height/y_div, 2)
     422            #mem_alloc_pitch returns the actual outstride used:
     423            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
     424            out_bufs.append(out_buf)
     425            out_strides.append(out_stride)
     426            out_sizes.append((out_stride, out_height))
     427        dest = out_t(*[ctypes.cast(int(out_buf), ctypes.c_void_p) for out_buf in out_bufs])
     428        if out_strides_t==ctypes.c_int:
     429            #one stride for all planes (this must be YUV444P)
     430            assert len(set(out_strides))==1, "more than one stride where only one expected in: %s" % out_strides
     431            out_strides = [out_strides[0]]
     432        args = [Npp8u_p(gpu_image), ctypes.c_int(stride), dest, out_strides_t(*out_strides), NppiSize(width, height)]
     433        debug("calling %s%s", self.kernel_function_name, tuple(args))
     434        kstart = time.time()
     435        v = self.kernel_function(*args)
     436        if v<0:
     437            log.error("%s%s returned an error: %s", self.kernel_function_name, args, ERRORS.get(v, v))
     438            return None
     439        elif v>0:
     440            #positive return-codes indicate warnings:
     441            warning = WARNINGS.get(v, "unknown")
     442            log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)
     443        kend = time.time()
     444        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
     445        gpu_image.free()
     446        self.frames += 1
     447        read_start = time.time()
     448        pixels = []
     449        for i in range(3):
     450            plane = driver.aligned_empty(out_sizes[i], dtype=numpy.byte)
     451            driver.memcpy_dtoh(plane, out_bufs[i])
     452            out_bufs[i].free()
     453            pixels.append(plane.data)
     454        context.synchronize()
     455        read_end = time.time()
     456        debug("read back took %.1fms", (read_end-read_start)*1000.0)
     457        return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, out_strides, planes=ImageWrapper._3_PLANES)
  • xpra/codecs/csc_nvcuda/colorspace_converter.pyx

     
    1 # This file is part of Xpra.
    2 # Copyright (C) 2013 Arthur Huillet
    3 # Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
    4 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    5 # later version. See the file COPYING for details.
    6 
    7 import time
    8 from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
    9 from xpra.codecs.image_wrapper import ImageWrapper
    10 
    11 cdef extern from "stdlib.h":
    12     void free(void *ptr)
    13 
    14 cdef extern from "Python.h":
    15     ctypedef int Py_ssize_t
    16     ctypedef object PyObject
    17     object PyBuffer_FromMemory(void *ptr, Py_ssize_t size)
    18     int PyObject_AsReadBuffer(object obj, void ** buffer, Py_ssize_t * buffer_len) except -1
    19 
    20 ctypedef unsigned char uint8_t
    21 ctypedef void csc_nvcuda_ctx
    22 cdef extern from "csc_nvcuda.h":
    23 #char **get_supported_colorspaces()
    24 
    25     int init_cuda()
    26     csc_nvcuda_ctx *init_csc(int src_width, int src_height, const char *src_format, const char *dst_format)
    27     void free_csc(csc_nvcuda_ctx *ctx)
    28     int csc_image(csc_nvcuda_ctx *ctx, const uint8_t *input_image[3], const int in_stride[3], uint8_t *out[3], int out_stride[3]) nogil
    29     void free_csc_image(uint8_t *buf[3])
    30     char *get_NPP_version()
    31 
    32 
    33 _init = None
    34 def init():
    35     global _init
    36     if _init is None:
    37         _init = init_cuda()==0
    38     return _init
    39 init()
    40 
    41 COLORSPACES_SRC = [ "RGB", "RGBA", "BGR", "BGRX" ]
    42 COLORSPACES_DST = [ "YUV420P", "YUV422P", "YUV444P" ]
    43 
    44 def get_version():
    45     return get_NPP_version()
    46 
    47 def get_input_colorspaces():
    48     if not init():
    49         return []
    50     return COLORSPACES_SRC
    51 
    52 def get_output_colorspaces(input_colorspace):
    53     if not init():
    54         return []
    55     #exclude input colorspace:
    56     return COLORSPACES_DST
    57 
    58 def get_spec(in_colorspace, out_colorspace):
    59     assert init(), "nvcuda is not available!"
    60     assert in_colorspace in COLORSPACES_SRC, "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, COLORSPACES_SRC)
    61     assert out_colorspace in COLORSPACES_DST, "invalid output colorspace: %s (must be one of %s)" % (out_colorspace, COLORSPACES_DST)
    62     #ratings: quality, speed, setup cost, cpu cost, gpu cost, latency, max_w, max_h, max_pixels
    63     return codec_spec(ColorspaceConverter, speed=100, setup_cost=10, cpu_cost=10, gpu_cost=50, min_w=16, min_h=16, can_scale=False)
    64 
    65 
    66 cdef class CSCImage:
    67     """
    68         Allows us to call free_csc_image
    69         when this object is garbage collected
    70     """
    71     cdef uint8_t *buf[3]
    72     cdef int freed
    73 
    74     cdef set_plane(self, int plane, uint8_t *buf):
    75         assert plane in (0, 1, 2)
    76         self.buf[plane] = buf
    77 
    78     def __dealloc__(self):
    79         #print("CSCImage.__dealloc__() calling free()")
    80         self.free()
    81 
    82     def free(self):
    83         #print("CSCImage.free() free_csc_image(..) already? %s" % self.freed)
    84         if self.freed==0:
    85             self.freed = 1
    86             free_csc_image(self.buf)
    87 
    88 
    89 class CSCImageWrapper(ImageWrapper):
    90 
    91     def free(self):                             #@DuplicatedSignature
    92         #print("YUVImageWrapper.free() csc_image=%s" % self.csc_image)
    93         if self.csc_image:
    94             self.csc_image.free()
    95             self.csc_image = None
    96 
    97 cdef class ColorspaceConverter:
    98     cdef int frames
    99     cdef csc_nvcuda_ctx *context
    100     cdef int src_width
    101     cdef int src_height
    102     cdef char* src_format
    103     cdef int dst_width
    104     cdef int dst_height
    105     cdef char* dst_format
    106     cdef double time
    107 
    108     def init_context(self, int src_width, int src_height, src_format,
    109                            int dst_width, int dst_height, dst_format, int speed=100):    #@DuplicatedSignature
    110         self.src_width = src_width
    111         self.src_height = src_height
    112         self.dst_width = dst_width
    113         self.dst_height = dst_height
    114         self.time = 0
    115         #ugly trick to use a string which won't go away from underneath us:
    116         assert src_format in COLORSPACES_SRC, "invalid source format: %s" % src_format
    117         for x in COLORSPACES_SRC:
    118             if x==src_format:
    119                 self.src_format = x
    120                 break
    121         assert dst_format in COLORSPACES_DST, "invalid destination format: %s" % dst_format
    122         for x in COLORSPACES_DST:
    123             if x==dst_format:
    124                 self.dst_format = x
    125                 break
    126         self.frames = 0
    127         self.context = init_csc(self.src_width, self.src_height, self.src_format, self.dst_format)
    128 
    129     def get_info(self):
    130         info = {"frames"    : self.frames,
    131                 "src_width" : self.src_width,
    132                 "src_height": self.src_height,
    133                 "src_format": self.src_format,
    134                 "dst_width" : self.dst_width,
    135                 "dst_height": self.dst_height,
    136                 "dst_format": self.dst_format}
    137         if self.frames>0 and self.time>0:
    138             pps = float(self.src_width) * float(self.src_height) * float(self.frames) / self.time
    139             info["total_time_ms"] = int(self.time*1000.0)
    140             info["pixels_per_second"] = int(pps)
    141         return info
    142 
    143     def __str__(self):
    144         return "nvcuda(%s %sx%s - %s %sx%s)" % (self.src_format, self.src_width, self.src_height,
    145                                                  self.dst_format, self.dst_width, self.dst_height)
    146 
    147     def __dealloc__(self):                  #@DuplicatedSignature
    148         self.clean()
    149 
    150     def get_src_width(self):
    151         return self.src_width
    152 
    153     def get_src_height(self):
    154         return self.src_height
    155 
    156     def get_src_format(self):
    157         return self.src_format
    158 
    159     def get_dst_width(self):
    160         return self.dst_width
    161 
    162     def get_dst_height(self):
    163         return self.dst_height
    164 
    165     def get_dst_format(self):
    166         return self.dst_format
    167 
    168     def get_type(self):
    169         return  "nvcuda"
    170 
    171 
    172     def clean(self):                        #@DuplicatedSignature
    173         if self.context!=NULL:
    174             free_csc(self.context)
    175             free(self.context)
    176             self.context = NULL
    177 
    178     def convert_image(self, image):
    179         cdef Py_ssize_t pic_buf_len = 0
    180         assert self.context!=NULL
    181         cdef const uint8_t *input_image[3]
    182         cdef uint8_t *output_image[3]
    183         cdef int input_stride[3]
    184         cdef int output_stride[3]
    185         cdef int planes
    186         cdef int i                          #@DuplicatedSignature
    187         cdef int height
    188         cdef int stride
    189         cdef int result
    190         planes = image.get_planes()
    191         assert planes in (0, 1, 3), "invalid number of planes: %s" % planes
    192         input = image.get_pixels()
    193         strides = image.get_rowstride()
    194         if planes==0:
    195             #magic: if planes==0, this is an XImageWrapper... with raw pixels/rowstride
    196             input = [input]
    197             strides = [strides]
    198             planes = 1
    199         #print("convert_image(%s) input=%s, strides=%s" % (image, len(input), strides))
    200         assert len(input)==planes, "expected %s planes but found %s" % (planes, len(input))
    201         assert len(strides)==planes, "expected %s rowstrides but found %s" % (planes, len(strides))
    202         for i in range(planes):
    203             input_stride[i] = strides[i]
    204             PyObject_AsReadBuffer(input[i], <const void**> &input_image[i], &pic_buf_len)
    205         start = time.time()
    206         with nogil:
    207             result = csc_image(self.context, input_image, input_stride, output_image, output_stride)
    208         if result != 0:
    209             return None
    210         end = time.time()
    211         self.time += (end-start)
    212         self.frames += 1
    213         #now parse the output:
    214         csci = CSCImage()           #keep a reference to memory for cleanup
    215         if self.dst_format.endswith("P"):
    216             nplanes = 3
    217             divs = get_subsampling_divs(self.dst_format)
    218             #print("convert_image(%s) nplanes=%s, divs=%s" % (image, nplanes, divs))
    219             out = []
    220             strides = []
    221             for i in range(nplanes):
    222                 _, dy = divs[i]
    223                 if dy==1:
    224                     height = self.dst_height
    225                 elif dy==2:
    226                     height = (self.dst_height+1)>>1
    227                 else:
    228                     raise Exception("invalid height divisor %s" % dy)
    229                 stride = output_stride[i]
    230                 if stride>0 and output_image[i]!=NULL:
    231                     plane = PyBuffer_FromMemory(<void *>output_image[i], height * stride)
    232                 else:
    233                     stride = 0
    234                     plane = None
    235                 csci.set_plane(i, output_image[i])
    236                 out.append(plane)
    237                 strides.append(stride)
    238         else:
    239             nplanes = 0
    240             strides = output_stride[0]
    241             out = PyBuffer_FromMemory(<void *>output_image[0], self.dst_height * strides)
    242             csci.set_plane(0, output_image[0])
    243         out_image = CSCImageWrapper(0, 0, self.dst_width, self.dst_height, out, self.dst_format, 24, strides, nplanes)
    244         out_image.csc_image = csci
    245         return out_image
  • xpra/codecs/csc_nvcuda/csc_nvcuda.c

     
    1 /* This file is part of Xpra.
    2  * Copyright (C) 2013 Arthur Huillet
    3  * Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    4  * later version. See the file COPYING for details.
    5  */
    6 
    7 #include <stdio.h>
    8 #include <stdlib.h>
    9 #include <string.h>
    10 #include <stdarg.h>
    11 
    12 #include <cuda.h>
    13 #include <cuda_runtime.h>
    14 #include <npp.h>
    15 
    16 #ifndef _WIN32
    17 #include <stdint.h>
    18 #include <unistd.h>
    19 #else
    20 #include "stdint.h"
    21 #include "inttypes.h"
    22 #endif
    23 
    24 #include "csc_nvcuda.h"
    25 
    26 static int cuda_device = -1;
    27 static int cuda_initialized = 0;
    28 static CUcontext *cuda_context;
    29 
    30 enum colorspace {
    31         UNKNOWN=-1,
    32         RGB = 0,
    33         RGBA,
    34         BGR,
    35         BGRA,
    36         YUV420P,
    37         YUV422P,
    38         YUV444P,
    39 };
    40 
    41 struct csc_nvcuda_ctx {
    42         int width;
    43         int height;
    44         enum colorspace src_colorspace;
    45         enum colorspace dst_colorspace;
    46 };
    47 
    48 static const struct {
    49         enum colorspace cspace;
    50         const char *name;
    51 } colorspaces[] = {
    52                 { RGB,     "RGB"     },
    53                 { RGBA,    "RGBA"    },
    54                 { BGR,     "BGR"     },
    55                 { BGRA,    "BGRX"    },
    56                 { YUV420P, "YUV420P" },
    57                 { YUV422P, "YUV422P" },
    58                 { YUV444P, "YUV444P" },
    59 };
    60 
    61 
    62 /* Representing the functions in a single table would be quite difficult.
    63    Instead, we use several tables to represent the Npp functions to be called.
    64    */
    65 typedef NppStatus (*packed_to_subsampled_planar_func) (const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int rDstStep[3], NppiSize oSizeROI);
    66 typedef NppStatus (*packed_to_planar_func) (const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int DstStep, NppiSize oSizeROI);
    67 
    68 static packed_to_planar_func NPP_dst_YUV444P[] = {
    69         [RGB] = nppiRGBToYCbCr_8u_C3P3R,
    70         [RGBA] = nppiRGBToYCbCr_8u_AC4P3R,
    71         [BGR] = NULL, // not present in NPP, need nppiSwapChannels first
    72         [BGRA] = NULL, // same as above
    73 };
    74 
    75 static packed_to_subsampled_planar_func NPP_dst_YUV422P[] = {
    76         [RGB] = nppiRGBToYCbCr422_8u_C3P3R,
    77         [RGBA] = NULL, //WTF?
    78         [BGR] = nppiBGRToYCbCr422_8u_C3P3R,
    79         [BGRA] = nppiBGRToYCbCr422_8u_AC4P3R,
    80 };
    81 
    82 static packed_to_subsampled_planar_func NPP_dst_YUV420P[] = {
    83         [RGB] =  nppiRGBToYCbCr420_8u_C3P3R,
    84         [RGBA] =  NULL, //WTF?
    85         [BGR] =  nppiBGRToYCbCr420_8u_C3P3R,
    86         [BGRA] =  nppiBGRToYCbCr420_8u_AC4P3R,
    87 };
    88 
    89 #define ARRAY_SIZE(X) (int)(sizeof(X)/sizeof(X[0]))
    90 static enum colorspace get_colorspace_by_name(const char *str)
    91 {
    92         int i;
    93         if (!str)
    94                 return UNKNOWN;
    95 
    96         for (i = 0; i < ARRAY_SIZE(colorspaces); i++) {
    97                 if (!strcmp(str, colorspaces[i].name))
    98                         return colorspaces[i].cspace;
    99         }
    100 
    101         fprintf(stderr, "Colorspace %s not supported.\n", str);
    102         return UNKNOWN;
    103 }
    104 
    105 /* Retrieve the conversion function for a un-subsampled planar destination.
    106  This cannot be unified because the NPP signatures are different from the other variants.*/
    107 packed_to_planar_func get_conversion_function_444(enum colorspace src, enum colorspace dst)
    108 {
    109         if (dst != YUV444P)
    110                 return NULL;
    111 
    112         if (src >= ARRAY_SIZE(NPP_dst_YUV444P)) {
    113                 fprintf(stderr, "Source colorspace %d not supported for YUV444P destination\n", src);
    114                 return NULL;
    115         }
    116 
    117         return NPP_dst_YUV444P[src];
    118 }
    119 
    120 /* Retrieve the conversion function for a subsampled planar destination. */
    121 packed_to_subsampled_planar_func get_conversion_function_subsampled(enum colorspace src, enum colorspace dst)
    122 {
    123 #define get_func(ARR) do { \
    124                 if (src >= ARRAY_SIZE(ARR)) \
    125                         return NULL; \
    126                 return ARR[src];        \
    127                 } while (0)
    128         if (dst == YUV420P) {
    129                 get_func(NPP_dst_YUV420P);
    130         } else if (dst == YUV422P) {
    131                 get_func(NPP_dst_YUV422P);
    132         } else {
    133                 fprintf(stderr, "Destination colorspace %d not supported as subsampled dest.\n", dst);
    134                 return NULL;
    135         }
    136 }
    137 
    138 static void _cuda_report_error(int line, const char *fmt, ...)
    139 {
    140         fprintf(stderr, "Cuda error in %s:%d: ", __FILE__, line);
    141         va_list ap;
    142         va_start(ap, fmt);
    143         vfprintf(stderr, fmt, ap);
    144         va_end(ap);
    145         fprintf(stderr, " - %s\n", cudaGetErrorString(cudaGetLastError()));
    146 }
    147 
    148 #define cuda_err(fmt, ...) _cuda_report_error(__LINE__, fmt, ##__VA_ARGS__)
    149 
    150 const char *get_NPP_version(void)
    151 {
    152 #define xstr(s) str(s)
    153 #define str(s) #s
    154         return "" xstr(NPP_VERSION_MAJOR) "." xstr(NPP_VERSION_MINOR) "." xstr(NPP_VERSION_BUILD);
    155 }
    156 
    157 int init_cuda(void)
    158 {
    159         int cuda_count = 0;
    160         char PCI_id[25];
    161         struct cudaDeviceProp prop;
    162 
    163         if (cudaGetDeviceCount(&cuda_count)) {
    164                 fprintf(stderr, "No CUDA devices available.\n");
    165         }
    166 
    167         int i;
    168         for (i = 0; i < cuda_count; i++) {
    169                 cudaSetDevice(i);
    170 
    171                 // Retrieve device properties
    172                 if (cudaGetDeviceProperties(&prop, i)) {
    173                         cuda_err("Error retrieving Cuda device %d properties, skipping", i);
    174                         continue;
    175                 }
    176 
    177                 // Check if device is able to map host memory
    178                 if (!prop.canMapHostMemory) {
    179                         cuda_err("Device %d cannot map host memory, skipping", i);
    180                         continue;
    181                 }
    182        
    183                 // Tell CUDA we want to map host memory
    184                 if(cudaSetDeviceFlags(cudaDeviceMapHost)) {
    185                         cuda_err("Unable to set cudaDeviceMapHost device flag");
    186                         return 1;
    187                 }
    188 
    189                 // All good - select this device!
    190                 break;
    191         }
    192 
    193         if (i == cuda_count) {
    194                 fprintf(stderr, "No suitable CUDA devices available.\n");
    195                 return 1;
    196         }
    197 
    198         // Select this device
    199         cuda_device = i;
    200 
    201         const NppLibraryVersion *lib_version = nppGetLibVersion();
    202 
    203         // Report status
    204                 // This call initializes the device for real, instead of it being done later when converting frames
    205         cudaDeviceGetPCIBusId(PCI_id, sizeof(PCI_id), cuda_device);
    206         printf("Using CUDA device %s at %s, NPP version %d.%d.%d\n", nppGetGpuName(), PCI_id, lib_version->major, lib_version->minor, lib_version->build);
    207 
    208         if (cuInit(0)) {
    209                 fprintf(stderr, "cuInit failed\n");
    210         }
    211 
    212         //printf("curren = %p\n", cuCtxGetCurrent(cuda_context));
    213         printf("Cuda context ptr %p\n", cuda_context);
    214         cuda_initialized = 1;
    215         return 0;
    216 }
    217 
    218 struct csc_nvcuda_ctx *init_csc(int width, int height, const char *src_format_str, const char *dst_format_str)
    219 {
    220         struct csc_nvcuda_ctx *ctx = malloc(sizeof(struct csc_nvcuda_ctx));
    221         if (!ctx)
    222                 return NULL;
    223        
    224         ctx->width = width;
    225         ctx->height = height;
    226         ctx->src_colorspace = get_colorspace_by_name(src_format_str);
    227         ctx->dst_colorspace = get_colorspace_by_name(dst_format_str);
    228 
    229         // Check if we have a conversion function for src->dst
    230         void *func;
    231         if (ctx->dst_colorspace == YUV444P) {
    232                 func = get_conversion_function_444(ctx->src_colorspace, ctx->dst_colorspace);
    233         } else {
    234                 func = get_conversion_function_subsampled(ctx->src_colorspace, ctx->dst_colorspace);
    235         }
    236         if (!func) {
    237                 fprintf(stderr, "Colorspace conversion with source %s and destination %s is not supported by csc_nvcuda.\n", src_format_str, dst_format_str);
    238                 goto err;
    239         }
    240 
    241         // Initialize Cuda (once in the application's lifetime)                 
    242         if (!cuda_initialized) {
    243                 fprintf(stderr, "Cuda was not initialized - please call init_cuda() before init_csc(). Initializing Cuda...\n");
    244                 if (init_cuda()) {
    245                         goto err;
    246                 }
    247         }
    248 
    249         return ctx;
    250 err:
    251         free(ctx);
    252         return NULL;
    253 }
    254 
    255 int csc_image(struct csc_nvcuda_ctx *ctx, const uint8_t *in[3], const int stride[3], uint8_t *out[3], int out_stride[3])
    256 {
    257         if (!ctx)
    258                 return 1;
    259 
    260         int pinned_input_buffer = 1;
    261         int pinned_output_buffer = 1;
    262         NppiSize size = { ctx->width, ctx->height };
    263         Npp8u *src = NULL; // GPU-side input buffer
    264         //uint8_t *dstbuf = NULL; // CPU-side linear output buffer (data + strides)
    265         uint8_t *gpudst[3] = { NULL, NULL, NULL }; // GPU-side planar output array
    266 
    267         // Plane dimensions
    268         int y_width = ctx->width;
    269         int uv_width = ctx->width;
    270         int uv_height = ctx->height;
    271 
    272         switch (ctx->dst_colorspace) {
    273                 case YUV420P:
    274                         uv_height /=  2;
    275                         /* fall through */
    276                 case YUV422P:
    277                         uv_width /= 2;
    278                         break;
    279                 case YUV444P:
    280                         ;
    281                 default:
    282                         fprintf(stderr, "%s: Unimplemented destination colorspace: %d\n", __FUNCTION__, ctx->dst_colorspace);
    283                         return 1;
    284         }
    285 
    286 
    287         // Pin CPU input buffer if possible
    288         if (cudaHostRegister((void *)in[0], stride[0]*ctx->height, cudaHostRegisterMapped)) {
    289                 pinned_input_buffer = 0;
    290         }
    291                
    292         // Allocate GPU input buffer
    293         if (cudaMalloc((void *)&src, stride[0]*ctx->height)) {
    294                 cuda_err("cudaMalloc input buf");
    295                 goto err0;
    296         }
    297 
    298         // Copy input data to GPU buffer
    299         if (pinned_input_buffer) {
    300                 // Use asynchronous copy if the buffer is pinned
    301                 if (cudaMemcpyAsync(src, in[0], stride[0]*ctx->height, cudaMemcpyHostToDevice, 0)) {
    302                         cuda_err("cudaMemcpyAsync input buf");
    303                         goto err1;
    304                 }
    305         } else {
    306                 if (cudaMemcpy(src, in[0], stride[0]*ctx->height, cudaMemcpyHostToDevice)) {
    307                         cuda_err("cudaMemcpy input buf");
    308                         goto err1;
    309                 }
    310         }
    311 
    312         cudaDeviceSynchronize();
    313        
    314 
    315         // Allocate GPU output buffer
    316         cudaMallocPitch((void *)&gpudst[0], (void *)&out_stride[0], y_width, ctx->height);
    317         cudaMallocPitch((void *)&gpudst[1], (void *)&out_stride[1], uv_width, uv_height);
    318         cudaMallocPitch((void *)&gpudst[2], (void *)&out_stride[2], uv_width, uv_height);
    319 
    320         // Allocate CPU output buffer
    321         out[0] = malloc(out_stride[0] * ctx->height + (out_stride[1] + out_stride[2]) * uv_height);
    322         out[1] = out[0] + out_stride[0] * ctx->height;
    323         out[2] = out[1] + out_stride[1] * uv_height;
    324         printf("instride %d\nCPU input:\t%p\n->GPU input:\t%p\noutstride %d\t%d\t%d\nCPU output:\t%p\t%p\t%p\n->GPU output:\t%p\t%p\t%p\n", stride[0], in[0], src, out_stride[0], out_stride[1], out_stride[2], out[0], out[1], out[2], gpudst[0], gpudst[1], gpudst[2]);
    325        
    326         // Pin output buffer if possible
    327         if (cudaHostRegister((void *)out[0], (out_stride[0] + out_stride[1] + out_stride[2]) * ctx->height, cudaHostRegisterMapped)) {
    328                 pinned_output_buffer = 0;
    329         }
    330 
    331         packed_to_subsampled_planar_func func = NULL;
    332         packed_to_planar_func func2 = NULL;
    333         int err = 0;
    334 
    335         if (ctx->dst_colorspace == YUV444P) {
    336                 func2 = get_conversion_function_444(ctx->src_colorspace, ctx->dst_colorspace);
    337                 if (func2)
    338                         err = func2(src, stride[0], gpudst, out_stride[0], size);
    339                 else goto err2;
    340         } else {
    341                 func = get_conversion_function_subsampled(ctx->src_colorspace, ctx->dst_colorspace);
    342                 if (func)
    343                         err = func(src, stride[0], gpudst, out_stride, size);
    344                 else goto err2;
    345         }
    346 
    347         cudaDeviceSynchronize();
    348         if (err) {
    349                 const char *str = NULL;
    350                 switch (err) {
    351                         case -4: str = "NPP_NULL_POINTER_ERROR"; break;
    352                         case -7: str = "NPP_STEP_ERROR"; break;
    353                         case -8: str = "NPP_ALIGNMENT_ERROR"; break;
    354                         case -19: str = "NPP_NOT_EVEN_STEP_ERROR"; break;
    355                         default:
    356                                           str = "(unknown)";
    357                 }
    358                 fprintf(stderr, "nppiRGBToYCbCr420_8u_C3P3R failed: %d - %s\n", err, str);
    359                 goto err2;
    360         }
    361 
    362         if (pinned_output_buffer) {
    363                 if (cudaMemcpyAsync(out[0], gpudst[0], out_stride[0] * ctx->height, cudaMemcpyDeviceToHost, 0) ||
    364                         cudaMemcpyAsync(out[1], gpudst[1], out_stride[1] * uv_height, cudaMemcpyDeviceToHost, 0) ||
    365                         cudaMemcpyAsync(out[2], gpudst[2], out_stride[2] * uv_height, cudaMemcpyDeviceToHost, 0)) {
    366                         cuda_err("cudaMemcpyAsync output buf");
    367                         goto err2;
    368                 }
    369         } else {
    370                 if (cudaMemcpy(out[0], gpudst[0], out_stride[0] * ctx->height, cudaMemcpyDeviceToHost) ||
    371                         cudaMemcpy(out[1], gpudst[1], out_stride[1] * uv_height, cudaMemcpyDeviceToHost) ||
    372                         cudaMemcpy(out[2], gpudst[2], out_stride[2] * uv_height, cudaMemcpyDeviceToHost)) {
    373                         cuda_err("cudaMemcpy output buf");
    374                         goto err2;
    375                 }
    376         }
    377         cudaDeviceSynchronize();
    378 
    379         // Free GPU output buffer
    380         cudaFree(gpudst[0]);
    381         cudaFree(gpudst[1]);
    382         cudaFree(gpudst[2]);
    383         // Free GPU input buffer
    384         cudaFree(src);
    385 
    386         // Un-pin CPU buffers
    387         if (pinned_input_buffer) {
    388                 cudaHostUnregister((void *)in);
    389         }
    390         if (pinned_output_buffer) {
    391                 cudaHostUnregister((void *)out[0]);
    392         }
    393 
    394         return 0;
    395 
    396 err2:
    397         if (pinned_output_buffer)
    398                 cudaHostUnregister((void *)out[0]);
    399         cudaFree(gpudst[0]);
    400         cudaFree(gpudst[1]);
    401         cudaFree(gpudst[2]);
    402         free(out[0]);
    403 err1:
    404         cudaFree(src);
    405 err0:
    406         if (pinned_input_buffer)
    407                 cudaHostUnregister((void *)in);
    408         return 1;
    409 }
    410 
    411 int free_csc_image(uint8_t *buf[3])
    412 {
    413         free(buf[0]);
    414         return 0;
    415 }
    416 
    417 void free_csc(struct csc_nvcuda_ctx *ctx)
    418 {
    419         return;
    420 }
    421 
    422 const char *get_flags_description(struct csc_nvcuda_ctx *ctx) {
    423         return "";
    424 }
  • xpra/codecs/csc_nvcuda/csc_nvcuda.h

     
    1 /* This file is part of Xpra.
    2  * Copyright (C) 2013 Arthur Huillet
    3  * Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    4  * later version. See the file COPYING for details.
    5  */
    6 
    7 #ifdef _WIN32
    8 #include "stdint.h"
    9 #include "inttypes.h"
    10 #else
    11 #include "stdint.h"
    12 #endif
    13 
    14 #ifdef _WIN32
    15 #define _STDINT_H
    16 #endif
    17 
    18 /** Opaque structure - "context". You must have a context to convert frames */
    19 struct csc_nvcuda_ctx;
    20 
    21 /**
    22  * Initialize Cuda. Call this before doing anything else.
    23  * Will be called by init_csc if you forget.
    24  * @return 0 if OK, non zero on error
    25  */
    26 int init_cuda(void);
    27 
    28 /**
    29  * Return the version of nvidia performance primitives,
    30  * the lib used by csc_nvcuda for CSC.
    31  */
    32 const char *get_NPP_version(void);
    33 
    34 /** Create a CSC context
    35  * @return NULL on error
    36  */
    37 struct csc_nvcuda_ctx *init_csc(int width, int height, const char *src_format_str, const char *dst_format_str);
    38 
    39 /** Free a CSC context */
    40 void free_csc(struct csc_nvcuda_ctx *ctx);
    41 
    42 /** Colorspace conversion.
    43  * Note: you must call free_csc_image() to free the image buffer.
    44  @param in: Input buffer planes.
    45  @param stride: Input strides.
    46  @param out: Array of pointers to be set to point to data planes.
    47  @param out_stride: Array of strides
    48  @return: 0 if OK, 1 on error
    49 */
    50 int csc_image(struct csc_nvcuda_ctx *ctx, const uint8_t *in[3], const int in_stride[3], uint8_t *out[3], int out_stride[3]);
    51 
    52 /**
    53  * Free the output of RGB 2 YUV conversion. You have to pass the pointer to the Y plane. This function will
    54  * free all planes at once.
    55  */
    56 int free_csc_image(uint8_t *buf[3]);
    57 
  • setup.py

     
    477477                   "xpra/codecs/dec_avcodec/constants.pxi",
    478478                   "xpra/codecs/csc_swscale/colorspace_converter.c",
    479479                   "xpra/codecs/csc_swscale/constants.pxi",
    480                    "xpra/codecs/csc_nvcuda/colorspace_converter.c",
    481480                   "xpra/codecs/xor/cyxor.c",
    482481                   "xpra/codecs/argb/argb.c",
    483482                   "xpra/server/stats/cymaths.c",
     
    957956
    958957toggle_packages(nvenc_ENABLED, "xpra.codecs.nvenc")
    959958toggle_packages(csc_opencl_ENABLED, "xpra.codecs.csc_opencl")
     959toggle_packages(csc_nvcuda_ENABLED, "xpra.codecs.csc_nvcuda")
    960960
    961961toggle_packages(enc_x264_ENABLED, "xpra.codecs.enc_x264")
    962962if enc_x264_ENABLED:
     
    997997                ["xpra/codecs/csc_swscale/colorspace_converter.pyx", "xpra/codecs/csc_swscale/csc_swscale.c", "xpra/codecs/memalign/memalign.c"],
    998998                **swscale_pkgconfig), min_version=(0, 19))
    999999
    1000 toggle_packages(csc_nvcuda_ENABLED, "xpra.codecs.csc_nvcuda")
    1001 if csc_nvcuda_ENABLED:
    1002     cuda_pkgconfig = pkgconfig("cuda")
    1003     cython_add(Extension("xpra.codecs.csc_nvcuda.colorspace_converter",
    1004                 ["xpra/codecs/csc_nvcuda/colorspace_converter.pyx", "xpra/codecs/csc_nvcuda/csc_nvcuda.c"],
    1005                 **cuda_pkgconfig), min_version=(0, 16))
    1006 
    10071000toggle_packages(vpx_ENABLED, "xpra.codecs.vpx")
    10081001if vpx_ENABLED:
    10091002    if vpx_static_ENABLED: