Bug tracker and wiki

This bug tracker and wiki are being discontinued;
please use https://github.com/Xpra-org/xpra instead.


Ticket #370: nvenc-pycuda-with-kernel3.patch

File nvenc-pycuda-with-kernel3.patch, 26.8 KB (added by Antoine Martin, 8 years ago)

Use (Py)CUDA to copy from the input buffer (already in NV12 format) to the output buffer (the NVENC buffer).
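In outline, the PyCUDA path works as sketched below: compile the kernel source at runtime with SourceModule, allocate pitched device buffers for the input and the NV12 output, stage the pixels in a page-locked host buffer, copy host to device in one call, and launch the kernel with one thread per output byte. This is only an illustrative sketch: pycuda.autoinit and the 640x480 dimensions are stand-ins, and the patch below manages its own context instead.

      import numpy
      import pycuda.autoinit          #shortcut: creates and pushes a context on device 0
      from pycuda import driver
      from pycuda.compiler import SourceModule

      from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel

      W, H = 640, 480                 #example dimensions (multiples of 16 here)
      #compile the kernel at runtime and get a callable:
      kernel = SourceModule(BGRA2NV12_kernel).get_function("BGRA2NV12")
      #pitched device buffers: NV12 needs H*3/2 rows (Y plane + interleaved UV plane)
      cuda_in, in_pitch = driver.mem_alloc_pitch(W, H * 3 // 2, 16)
      cuda_nv12, nv12_pitch = driver.mem_alloc_pitch(W, H * 3 // 2, 16)
      #page-locked host buffer, copied to the device in a single call:
      host_buf = driver.pagelocked_zeros(in_pitch * H * 3 // 2, dtype=numpy.byte)
      driver.memcpy_htod(cuda_in, host_buf)
      #launch with 16x16 threads per block, one thread per output byte:
      kernel(cuda_in, numpy.int32(in_pitch),
             cuda_nv12, numpy.int32(nv12_pitch),
             numpy.int32(W), numpy.int32(H * 3 // 2),
             block=(16, 16, 1), grid=(W // 16, (H * 3 // 2) // 16))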

  • xpra/codecs/nvenc/CUDA_rgb2nv12.py

     
      # This file is part of Xpra.
      # Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
      # Xpra is released under the terms of the GNU GPL v2, or, at your option, any
      # later version. See the file COPYING for details.


      BGRA2NV12_kernel = """
      #include <stdint.h>

      __global__ void BGRA2NV12(uint8_t *srcImage,    int srcPitch,
                                uint8_t *dstImage,    int dstPitch,
                                int width,            int height)
      {
          int32_t x, y;
          x = blockIdx.x * blockDim.x + threadIdx.x;
          y = blockIdx.y * blockDim.y + threadIdx.y;

          if (x >= width)
              return;
          if (y >= height)
              return;

          //just copy for now...
          dstImage[y * dstPitch + x] = srcImage[y * srcPitch + x];
      }
      """
  • xpra/codecs/nvenc/encoder.pyx

     
      import binascii
      import time
      import os
     +import numpy

      from xpra.codecs.image_wrapper import ImageWrapper
      from xpra.log import Logger, debug_if_env
     
          void* malloc(size_t __size)
          void free(void* mem)

     -#could also use pycuda...
      cdef extern from "cuda.h":
     -    ctypedef int CUdevice
          ctypedef int CUresult
     -    ctypedef void* CUdeviceptr
          ctypedef void* CUcontext
     -    ctypedef enum CUdevice_attribute:
     -        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
     -        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH
     -        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT
     -        CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
     -        CU_DEVICE_ATTRIBUTE_PCI_BUS_ID
     -        CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID
     -        CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
     +    CUresult cuCtxGetCurrent(CUcontext *pctx)

     -    CUresult cuInit(unsigned int flags)
     -    CUresult cuDeviceGet(CUdevice *device, int ordinal)
     -    CUresult cuDeviceGetCount(int *count)
     -    CUresult cuDeviceGetName(char *name, int len, CUdevice dev)
     -    CUresult cuDeviceComputeCapability(int *major, int *minor, CUdevice dev)
     -
     -    CUresult cuDeviceTotalMem(size_t *bytes, CUdevice dev)
     -    CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev)
     -
     -    CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
     -    CUresult cuCtxPopCurrent(CUcontext *pctx)
     -    CUresult cuCtxPushCurrent(CUcontext ctx)
     -    CUresult cuCtxDestroy(CUcontext ctx)
     -
     -    CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
     -    CUresult cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes)
     -    CUresult cuMemAllocHost(void **pp, size_t bytesize)
     -    CUresult cuMemFree(CUdeviceptr dptr)
     -    CUresult cuMemFreeHost(void *p)
     -
     -
      cdef extern from "NvTypes.h":
          pass

     
          #ratings: quality, speed, setup cost, cpu cost, gpu cost, latency, max_w, max_h, max_pixels
          return codec_spec(Encoder, codec_type=get_type(), encoding=encoding,
                            quality=60, setup_cost=100, cpu_cost=10, gpu_cost=100,
     -                      min_w=2, min_h=2, max_w=4096, max_h=4096,
     +                      min_w=2, min_h=2,
     +                      #we may want to limit to 32x32 minimum because of the CUDA kernel
     +                      #min_w=32, min_h=32,
     +                      max_w=4096, max_h=4096,
                            width_mask=0xFFFE, height_mask=0xFFFE)


     
          return (n + m - 1) & ~(m - 1)


     -CUDA_STATUS_TXT = {}
     -cdef cudaStatusInfo(CUresult ret):
     -    if ret in CUDA_STATUS_TXT:
     -        return "%s: %s" % (ret, CUDA_STATUS_TXT[ret])
     -    return str(ret)
     +def device_info(d):
     +    return "%s @ %s" % (d.name(), d.pci_bus_id())

     -cdef checkCuda(CUresult ret, msg=""):
     -    if ret!=0:
     -        log.warn("error during %s: %s", msg, cudaStatusInfo(ret))
     -    return ret
     -cdef raiseCUDA(CUresult ret, msg=""):
     -    if ret!=0:
     -        raise Exception("%s - returned %s" % (msg, cudaStatusInfo(ret)))
     -
      cdef cuda_init_devices():
     -    cdef int deviceCount, i
     -    cdef CUdevice cuDevice
     -    cdef char gpu_name[100]
     -    cdef int SMminor, SMmajor
     -    cdef size_t totalMem
     -    cdef int multiProcessorCount
     -    cdef int max_width, max_height
     -    cdef int canMapHostMemory
     -    cdef int computeMode
     -    cdef int pciBusID, pciDeviceID
     -    cdef CUresult r
          start = time.time()
          log.info("CUDA initialization (this may take a few seconds)")
     -    raiseCUDA(cuInit(0), "cuInit")
     -    debug("cuda_init_devices() cuInit() took %.1fms", 1000.0*(time.time()-start))
     -    raiseCUDA(cuDeviceGetCount(&deviceCount), "failed to get device count")
     -    debug("cuda_init_devices() found %s devices", deviceCount)
     +    import pycuda
     +    from pycuda import driver
     +    driver.init()
     +    ngpus = driver.Device.count()
     +    debug("PyCUDA found %s devices:", ngpus)
          devices = {}
     -    for i in range(deviceCount):
     -        r = cuDeviceGet(&cuDevice, i)
     -        checkCuda(r, "cuDeviceGet")
     -        if r!=0:
     -            continue
     -        checkCuda(cuDeviceGetName(gpu_name, 100, cuDevice), "cuDeviceGetName")
     -        checkCuda(cuDeviceComputeCapability(&SMmajor, &SMminor, i), "cuDeviceComputeCapability")
     +    da = driver.device_attribute
     +    for i in range(ngpus):
     +        d = driver.Device(i)
     +        mem = d.total_memory()
     +        host_mem = d.get_attribute(da.CAN_MAP_HOST_MEMORY)
     +        debug(" max block sizes: (%s, %s, %s)", d.get_attribute(da.MAX_BLOCK_DIM_X), d.get_attribute(da.MAX_BLOCK_DIM_Y), d.get_attribute(da.MAX_BLOCK_DIM_Z))
     +        debug(" max grid sizes: (%s, %s, %s)", d.get_attribute(da.MAX_GRID_DIM_X), d.get_attribute(da.MAX_GRID_DIM_Y), d.get_attribute(da.MAX_GRID_DIM_Z))
     +        #SMmajor, SMminor = d.compute_capability()
     +        SMmajor, SMminor = 0xFFFF, 0xFFFF
              has_nvenc = ((SMmajor<<4) + SMminor) >= 0x30
     -        cuDeviceGetAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cuDevice)
     -        cuDeviceGetAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cuDevice)
     -        raiseCUDA(cuDeviceTotalMem(&totalMem, cuDevice), "cuDeviceTotalMem")
     -        debug("device[%s]=%s (%sMB) - PCI: %02d:%02d - compute %s.%s (nvenc=%s)",
     -                i, gpu_name, int(totalMem/1024/1024), pciBusID, pciDeviceID, SMmajor, SMminor, has_nvenc)
     -        devices[i] = "%s - PCI: %02d:%02d" % (gpu_name, pciBusID, pciDeviceID)
     -        cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice)
     -        #log.info("multiProcessorCount=%s", multiProcessorCount)
     -        #printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
     -                #multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
     -                #_ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
     -        cuDeviceGetAttribute(&max_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, cuDevice)
     -        cuDeviceGetAttribute(&max_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, cuDevice)
     -        cuDeviceGetAttribute(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)
     -        cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cuDevice)
     -        debug(" Can Map Host Memory: %s, Compute Mode: %s, MultiProcessor Count: %s, max dimensions: %sx%s", canMapHostMemory, computeMode, multiProcessorCount, max_width, max_height)
     +        pre = "-"
     +        if host_mem and has_nvenc:
     +            pre = "+"
     +            devices[i] = device_info(d)
     +        debug(" %s %s (%sMB)", pre, device_info(d), mem/1024/1024)
     +        max_width = d.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
     +        max_height = d.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
     +        debug(" Can Map Host Memory: %s, Compute Mode: %s, max dimensions: %sx%s", host_mem, (SMmajor, SMminor), max_width, max_height)
          end = time.time()
          debug("cuda_init_devices() took %.1fms", 1000.0*(end-start))
          return devices
     
              log.info(" + %s", cuda_devices.get(device_id))
          return cuda_devices

     -cdef CUdevice get_cuda_device(deviceId=0):
     -    global cuda_devices
     -    cdef CUdevice cuDevice            #@DuplicatedSignature
     -    cdef char gpu_name[100]           #@DuplicatedSignature
     -    cdef int SMminor, SMmajor         #@DuplicatedSignature
     -    if deviceId<0:
     -        deviceId = 0
     -    if deviceId not in cuda_devices:
     -        raise Exception("invalid deviceId %s: only %s devices found" % (deviceId, len(cuda_devices)))
     -    raiseCUDA(cuDeviceGet(&cuDevice, deviceId), "cuDeviceGet")
     -    raiseCUDA(cuDeviceGetName(gpu_name, 100, cuDevice), "cuDeviceGetName")
     -    debug("using CUDA device %s: %s", deviceId, gpu_name)
     -    raiseCUDA(cuDeviceComputeCapability(&SMmajor, &SMminor, deviceId), "cuDeviceComputeCapability")
     -    has_nvenc = ((SMmajor<<4) + SMminor) >= 0x30
     -    if FORCE and not has_nvenc:
     -        log.warn("selected device %s does not have NVENC capability!" % gpu_name)
     -    else:
     -        assert has_nvenc, "selected device %s does not have NVENC capability!" % gpu_name
     -    return cuDevice
     -
      DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))

      def cuda_check():
     -    cdef CUcontext context
     -    cdef CUdevice cuDevice              #@DuplicatedSignature
     -    cuda_devices = get_cuda_devices()
     -    if len(cuda_devices)==0:
     +    global DEFAULT_CUDA_DEVICE_ID
     +    devices = get_cuda_devices()
     +    if len(devices)==0:
              raise ImportError("no CUDA devices found!")
     -    assert DEFAULT_CUDA_DEVICE_ID in cuda_devices.keys(), "specified CUDA device ID %s not found in %s" % (DEFAULT_CUDA_DEVICE_ID, cuda_devices)
     -    cuDevice = get_cuda_device(DEFAULT_CUDA_DEVICE_ID)
     +    assert DEFAULT_CUDA_DEVICE_ID in devices.keys(), "specified CUDA device ID %s not found in %s" % (DEFAULT_CUDA_DEVICE_ID, devices)
     +    #create context for testing:
     +    from pycuda import driver
     +    d = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     +    context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     +    debug("cuda_check created test context, api_version=%s", context.get_api_version())
     +    context.pop()
     +    context.detach()

     -    raiseCUDA(cuCtxCreate(&context, 0, cuDevice), "creating CUDA context")
     -    raiseCUDA(cuCtxPopCurrent(&context), "popping current context")
     -    raiseCUDA(cuCtxDestroy(context), "destroying current context")

     -
      cdef nvencStatusInfo(NVENCSTATUS ret):
          if ret in NV_ENC_STATUS_TXT:
              return "%s: %s" % (ret, NV_ENC_STATUS_TXT[ret])
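
A note on the PyCUDA context handling used above and throughout the patch: Device.make_context() both creates the context and makes it current on the calling thread, which is why cuda_check() pops it and then detaches it to release the reference. A minimal sketch of that lifecycle:

      from pycuda import driver

      driver.init()
      d = driver.Device(0)
      #make_context() creates the context AND pushes it onto this thread's stack:
      ctx = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
      try:
          pass            #context is current here: allocate, copy, launch kernels...
      finally:
          ctx.pop()       #remove it from the current thread
          ctx.detach()    #drop our reference so it can be destroyed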
     
              raise Exception("%s - returned %s" % (msg, nvencStatusInfo(ret)))


     +#BGRA2NV12_functions = {}
     +def get_BGRA2NV12():
     +    from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel
     +    from pycuda.compiler import SourceModule
     +    log.info("BGRA2NV12=%s", BGRA2NV12_kernel)
     +    mod = SourceModule(BGRA2NV12_kernel)
     +    BGRA2NV12_function = mod.get_function("BGRA2NV12")
     +    return BGRA2NV12_function
     +
     +
      cdef class Encoder:
          cdef int width
          cdef int height
          cdef int encoder_width
          cdef int encoder_height
          cdef object src_format
     -    cdef CUcontext cuda_context
     +    #PyCUDA:
     +    cdef object driver
     +    cdef object cuda_device
     +    cdef object cuda_context
     +    cdef object BGRA2NV12
     +    cdef object max_block_sizes
     +    cdef object max_grid_sizes
     +    cdef int max_threads_per_block
     +    #NVENC:
          cdef NV_ENCODE_API_FUNCTION_LIST functionList               #@DuplicatedSignature
          cdef void *context
          cdef NV_ENC_REGISTERED_PTR inputHandle
     -    cdef CUdeviceptr cudaBuffer
     -    cdef void *inputBuffer
     -    cdef size_t pitch
     +    cdef object inputBuffer
     +    cdef object cudaInputBuffer
     +    cdef object cudaNV12Buffer
     +    cdef int inputPitch
     +    cdef int NV12Pitch
          cdef void *bitstreamBuffer
          cdef NV_ENC_BUFFER_FORMAT bufferFmt
          cdef object codec_name
     
              self.codec_name = "H264"
              self.preset_name = None
              self.frames = 0
     -        self.cuda_context = NULL
     +        self.cuda_device = None
     +        self.cuda_context = None
              start = time.time()

              device_id = options.get("cuda_device", DEFAULT_CUDA_DEVICE_ID)
     

          def init_cuda(self, device_id):
              assert device_id in get_cuda_devices().keys(), "invalid device_id '%s' (available: %s)" % (device_id, cuda_devices)
     -        cdef CUdevice cuda_device              #@DuplicatedSignature
     -        cuda_device = get_cuda_device(DEFAULT_CUDA_DEVICE_ID)
     -        raiseCUDA(cuCtxCreate(&self.cuda_context, 0, cuda_device), "cuCtxCreate")
     -        debug("cuCtxCreate: device_id=%s, cuda_device=%s, cuda_context=%s", device_id, cuda_device, hex(<long> self.cuda_context))
     -        #allocate CUDA input buffer (on device):
     -        raiseCUDA(cuMemAllocPitch(&self.cudaBuffer, &self.pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device")
     -        debug("cudaBuffer=%s, pitch=%s", hex(<long> self.cudaBuffer), self.pitch)
     -        #allocate buffer on host:
     -        raiseCUDA(cuMemAllocHost(&self.inputBuffer, self.pitch*self.encoder_height*3/2), "allocating CUDA input buffer on host")
     -        debug("inputBuffer=%s", hex(<long> self.inputBuffer))

     -        self.init_nvenc()
     +        from pycuda import driver
     +        self.driver = driver
     +        debug("init_cuda(%s)", device_id)
     +        self.cuda_device = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     +        self.cuda_context = self.cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     +        #use alias to make code easier to read:
     +        d = self.cuda_device
     +        da = driver.device_attribute
     +        try:
     +            debug("init_cuda(%s) cuda_device=%s, cuda_context=%s", device_id, self.cuda_device, self.cuda_context)
     +            #compile/get kernel:
     +            self.BGRA2NV12 = get_BGRA2NV12()
     +            #allocate CUDA input buffer (on device):
     +            self.cudaInputBuffer, self.inputPitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3/2, 16)
     +            debug("CUDA Input Buffer=%s, pitch=%s", hex(int(self.cudaInputBuffer)), self.inputPitch)
     +            #allocate CUDA NV12 buffer (on device):
     +            self.cudaNV12Buffer, self.NV12Pitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3/2, 16)
     +            debug("CUDA NV12 Buffer=%s, pitch=%s", hex(int(self.cudaNV12Buffer)), self.NV12Pitch)
     +            #allocate input buffer on host:
     +            #self.inputBuffer = driver.pagelocked_empty(self.encoder_width*self.encoder_height, dtype=numpy.byte)
     +            self.inputBuffer = driver.pagelocked_zeros(self.inputPitch*self.encoder_height*3/2, dtype=numpy.byte)
     +            debug("inputBuffer=%s", self.inputBuffer)

     -        raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "cuCtxPopCurrent")
     +            self.max_block_sizes = d.get_attribute(da.MAX_BLOCK_DIM_X), d.get_attribute(da.MAX_BLOCK_DIM_Y), d.get_attribute(da.MAX_BLOCK_DIM_Z)
     +            self.max_grid_sizes = d.get_attribute(da.MAX_GRID_DIM_X), d.get_attribute(da.MAX_GRID_DIM_Y), d.get_attribute(da.MAX_GRID_DIM_Z)
     +            debug("max_block_sizes=%s", self.max_block_sizes)
     +            debug("max_grid_sizes=%s", self.max_grid_sizes)

     +            self.max_threads_per_block = self.BGRA2NV12.get_attribute(driver.function_attribute.MAX_THREADS_PER_BLOCK)
     +            debug("max_threads_per_block=%s", self.max_threads_per_block)
     +
     +            self.init_nvenc()
     +        finally:
     +            self.cuda_context.pop()
     +
          def init_nvenc(self):
              cdef GUID codec
              cdef GUID preset
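
The pitched allocations in init_cuda() use driver.mem_alloc_pitch(), which returns both the device pointer and the pitch, i.e. the driver-chosen aligned row length in bytes; rows must then be addressed as y*pitch rather than y*width, which is what both the host-side copy loop and the kernel do. A tiny illustration (the dimensions are arbitrary):

      from pycuda import driver
      import pycuda.autoinit

      #ask for 100-byte wide rows, 64 rows, 16-byte access size:
      devptr, pitch = driver.mem_alloc_pitch(100, 64, 16)
      assert pitch >= 100         #the driver may round the row length up
      row10_offset = 10 * pitch   #byte offset of row 10 on the device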
     
              cdef NV_ENC_INITIALIZE_PARAMS params
              cdef NV_ENC_PRESET_CONFIG *presetConfig     #@DuplicatedSignature
              cdef NV_ENC_REGISTER_RESOURCE registerResource
     +        cdef NV_ENC_CREATE_INPUT_BUFFER createInputBufferParams
     +        cdef NV_ENC_CREATE_BITSTREAM_BUFFER createBitstreamBufferParams
     +        cdef long resource

              self.open_encode_session()
              codec = self.get_codec()
     
              input_format = BUFFER_FORMAT[self.bufferFmt]
              input_formats = self.query_input_formats(codec)
              assert input_format in input_formats, "%s does not support %s (only: %s)" %  (self.codec_name, input_format, input_formats)
     -
     -        cdef NV_ENC_CREATE_INPUT_BUFFER createInputBufferParams
     -        cdef NV_ENC_CREATE_BITSTREAM_BUFFER createBitstreamBufferParams
              try:
                  presetConfig = self.get_preset_config(codec, preset)

     
              memset(&registerResource, 0, sizeof(NV_ENC_REGISTER_RESOURCE))
              registerResource.version = NV_ENC_REGISTER_RESOURCE_VER
              registerResource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR
     -        registerResource.resourceToRegister = <void *> self.cudaBuffer
     +        resource = int(self.cudaNV12Buffer)
     +        registerResource.resourceToRegister = <void *> resource
              registerResource.width = self.encoder_width
              registerResource.height = self.encoder_height
     -        registerResource.pitch = self.pitch
     +        registerResource.pitch = self.NV12Pitch
              raiseNVENC(self.functionList.nvEncRegisterResource(self.context, &registerResource), "registering CUDA input buffer")
              self.inputHandle = registerResource.registeredResource
              debug("input handle for CUDA buffer: %s", hex(<long> self.inputHandle))
     

          def clean(self):                        #@DuplicatedSignature
              debug("clean() context=%s", hex(<long> self.context))
     -        if self.cuda_context!=NULL:
     -            raiseCUDA(cuCtxPushCurrent(self.cuda_context), "failed to push context")
     +        if self.cuda_context:
     +            self.cuda_context.push()
              try:
                  self.cuda_clean()
              finally:
     -            raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
     -            cuCtxDestroy(self.cuda_context)
     -            self.cuda_context = NULL
     +            self.cuda_context.pop()
     +            self.cuda_context.detach()
     +            self.cuda_context = None

          def cuda_clean(self):
     -        if self.inputHandle!=NULL:
     +        if self.inputHandle!=NULL and self.context!=NULL:
                  debug("clean() unregistering %s", hex(<long> self.inputHandle))
                  raiseNVENC(self.functionList.nvEncUnregisterResource(self.context, self.inputHandle), "unregistering CUDA input buffer")
                  self.inputHandle = NULL
     -        if self.inputBuffer!=NULL:
     -            debug("clean() freeing CUDA host buffer %s", hex(<long> self.inputBuffer))
     -            raiseCUDA(cuMemFreeHost(self.inputBuffer), "freeing host buffer")
     -            self.inputBuffer = NULL
     -        if (<void *> self.cudaBuffer)!=NULL:
     -            debug("clean() freeing CUDA device buffer %s", hex(<long> self.cudaBuffer))
     -            raiseCUDA(cuMemFree(self.cudaBuffer), "freeing CUDA buffer")
     -            self.cudaBuffer = <CUdeviceptr> NULL
     -        if self.bitstreamBuffer!=NULL:
     -            debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
     -            raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
     -            self.bitstreamBuffer = NULL
     -        raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
     +        if self.inputBuffer is not None:
     +            debug("clean() freeing CUDA host buffer %s", self.inputBuffer)
     +            self.inputBuffer = None
     +        if self.cudaInputBuffer is not None:
     +            debug("clean() freeing CUDA input buffer %s", hex(int(self.cudaInputBuffer)))
     +            self.cudaInputBuffer.free()
     +            self.cudaInputBuffer = None
     +        if self.cudaNV12Buffer is not None:
     +            debug("clean() freeing CUDA NV12 buffer %s", hex(int(self.cudaNV12Buffer)))
     +            self.cudaNV12Buffer.free()
     +            self.cudaNV12Buffer = None
     +        if self.context!=NULL:
     +            if self.bitstreamBuffer!=NULL:
     +                debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
     +                raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
     +                self.bitstreamBuffer = NULL
     +            debug("clean() destroying encoder %s", hex(<long> self.context))
     +            raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")

          def get_width(self):
              return self.width
     
              self.functionList.nvEncEncodePicture(self.context, &picParams)

          def compress_image(self, image, options={}):
     -        raiseCUDA(cuCtxPushCurrent(self.cuda_context), "failed to push context")
     +        self.cuda_context.push()
              try:
                  return self.do_compress_image(image, options)
              finally:
     -            raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
     +            self.cuda_context.pop()

          def do_compress_image(self, image, options={}):
              cdef const void* Y = NULL
     
              cdef NV_ENC_MAP_INPUT_RESOURCE mapInputResource
              cdef NV_ENC_LOCK_BITSTREAM lockOutputBuffer
              cdef size_t size
     -        cdef long offset = 0
     +        cdef int offset = 0
              cdef input_buf_len = 0
              cdef int x, y, stride
              cdef int w, h
     
              debug("compress_image(..) pixels=%s", type(pixels))

              #copy to input buffer:
     -        size = self.pitch * self.encoder_height * 3/2
     -        memset(self.inputBuffer, 0, size)
     +        size = self.inputPitch * self.encoder_height * 3/2
              #copy luma:
     -        assert PyObject_AsReadBuffer(pixels[0], &Y, &Y_len)==0
     -        assert PyObject_AsReadBuffer(pixels[1], &Cb, &Cb_len)==0
     -        assert PyObject_AsReadBuffer(pixels[2], &Cr, &Cr_len)==0
              stride = strides[0]
     +        Yplane = pixels[0]
     +        buffer = self.inputBuffer.data
              for y in range(h):
     -            memcpy(self.inputBuffer + y*self.pitch, Y + stride*y, w)
     +            dst = y * self.inputPitch
     +            src = y * stride
     +            #debug("%s: %s:%s (size=%s) <- %s:%s (size=%s)", y, dst, dst+w, len(buffer), src, src+w, len(Yplane))
     +            buffer[dst:(dst+w)] = Yplane[src:(src+w)]
              #copy chroma packed:
              assert strides[1]==strides[2], "U and V strides differ: %s vs %s" % (strides[1], strides[2])
              stride = strides[1]
     +        Uplane = pixels[1]
     +        Vplane = pixels[2]
              for y in range(h/2):
     -            offset = (self.encoder_height + y) * self.pitch
     +            offset = (self.encoder_height + y) * self.inputPitch
                  for x in range(w/2):
     -                (<char*> self.inputBuffer)[offset + (x*2)] = (<char *> Cb)[stride*y + x]
     -                (<char*> self.inputBuffer)[offset + (x*2)+1] = (<char *> Cr)[stride*y + x]
     +                if y==0 and x==0:
     +                    debug("type(Uplane item)=%s=%s", Uplane[stride*y + x], type(Uplane[stride*y + x]))
     +                buffer[offset + (x*2):offset + (x*2)+1] = Uplane[stride*y + x:stride*y + x+1]
     +                buffer[offset + (x*2) + 1:offset + (x*2) + 2] = Vplane[stride*y + x:stride*y + x+1]
     +        debug("compress_image(..) host buffer populated")

              #copy input buffer to CUDA buffer:
     -        raiseCUDA(cuMemcpyHtoD(self.cudaBuffer, self.inputBuffer, size), "copy from host to device")
     +        self.driver.memcpy_htod(self.cudaInputBuffer, self.inputBuffer)
              debug("compress_image(..) input buffer copied to device")

     +        #FIXME: just for testing fill the buffer with our input already:
     +        #self.driver.memcpy_htod(self.cudaNV12Buffer, self.inputBuffer)
     +        #FIXME: find better values and validate against max_block/max_grid:
     +        blockw = 16
     +        blockh = 16
     +        gridw = max(1, self.encoder_width/blockw)
     +        gridh = max(1, self.encoder_height*3/2/blockh)
     +        self.BGRA2NV12(self.cudaInputBuffer, numpy.int32(self.inputPitch),
     +                       self.cudaNV12Buffer, numpy.int32(self.NV12Pitch),
     +                       numpy.int32(self.width), numpy.int32(self.encoder_height*3/2),
     +                       block=(blockw,blockh,1), grid=(gridw, gridh))
     +        #a block is a group of threads: (blockw * blockh) threads
     +        #a grid is a group of blocks: (gridw * gridh) blocks
     +        debug("compress_image(..) kernel executed")
     +
              #map buffer so nvenc can access it:
              memset(&mapInputResource, 0, sizeof(NV_ENC_MAP_INPUT_RESOURCE))
              mapInputResource.version = NV_ENC_MAP_INPUT_RESOURCE_VER
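
To make the launch dimensions above concrete: a 16x16 block is 256 threads, comfortably below the max_threads_per_block value queried in init_cuda(), and the grid must cover encoder_height*3/2 rows because the NV12 buffer holds the full-height Y plane plus the half-height interleaved UV plane. For a hypothetical 1920x1088 encoder surface:

      blockw, blockh = 16, 16
      encoder_width, encoder_height = 1920, 1088
      gridw = max(1, encoder_width // blockw)                #120 blocks across
      gridh = max(1, (encoder_height * 3 // 2) // blockh)    #102 blocks down
      #120 * 102 blocks x 256 threads = 3,133,440 threads: one per output byte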
     
              picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME
              picParams.inputWidth = self.encoder_width
              picParams.inputHeight = self.encoder_height
     -        picParams.inputPitch = self.pitch
     +        picParams.inputPitch = self.NV12Pitch
              picParams.inputBuffer = mapInputResource.mappedResource
              picParams.outputBitstream = self.bitstreamBuffer
              #picParams.pictureType: required when enablePTD is disabled
     
                  if full_query:
                      presets = self.query_presets(encode_GUID)
                      debug("  presets=%s", presets)
     -
     +
                      profiles = self.query_profiles(encode_GUID)
                      debug("  profiles=%s", profiles)
     -
     +
                      input_formats = self.query_input_formats(encode_GUID)
                      debug("  input formats=%s", input_formats)
          finally:
     

          cdef open_encode_session(self):
              cdef NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params
     -        debug("open_encode_session(%s)", hex(<long> self.cuda_context))
     +        debug("open_encode_session() cuda_context=%s", self.cuda_context)
     +        debug("open_encode_session() cuda_context=%s", dir(self.cuda_context))

              #get NVENC function pointers:
              memset(&self.functionList, 0, sizeof(NV_ENCODE_API_FUNCTION_LIST))
     
              raiseNVENC(NvEncodeAPICreateInstance(&self.functionList), "getting API function list")
              assert self.functionList.nvEncOpenEncodeSessionEx!=NULL, "looks like NvEncodeAPICreateInstance failed!"

     +        #get the CUDA context (C pointer):
     +        cdef CUcontext cuda_context
     +        cdef CUresult result
     +        result = cuCtxGetCurrent(&cuda_context)
     +        assert result==0, "failed to get current cuda context"
     +
              #NVENC init:
              memset(&params, 0, sizeof(NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS))
              params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER
              params.deviceType = NV_ENC_DEVICE_TYPE_CUDA
     -        params.device = <void*> self.cuda_context
     +        params.device = <void*> cuda_context
              params.clientKeyPtr = &CLIENT_KEY_GUID
              params.apiVersion = NVENCAPI_VERSION
              #params.clientKeyPtr = client_key