xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued;
please use https://github.com/Xpra-org/xpra instead.


Ticket #370: nvenc-pycuda-with-kernel.patch

File nvenc-pycuda-with-kernel.patch, 27.1 KB (added by Antoine Martin, 8 years ago)

"working" pycuda version with an empty kernel

  • xpra/codecs/nvenc/CUDA_rgb2nv12.py

     
     1# This file is part of Xpra.
     2# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
     3# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
     4# later version. See the file COPYING for details.
     5
     6
     7BGRA2NV12_kernel = """
     8#include <stdint.h>
     9
     10__global__ void BGRA2NV12(uint8_t *srcImage,    int srcPitch,
     11                          uint8_t *dstImage,    int dstPitch,
     12                          int width,            int height)
     13{
     14    int32_t x, y;
     15    x = blockIdx.x * blockDim.x + threadIdx.x;
     16    y = blockIdx.y * blockDim.y + threadIdx.y;
     17
     18    if (x >= width)
     19        return;
     20    if (y >= height)
     21        return;
     22
     23    //just copy for now...
     24    //dstImage[y * dstPitch + x] = srcImage[y * srcPitch + x];
     25}
     26"""
  • xpra/codecs/nvenc/encoder.pyx

     
    66import binascii
    77import time
    88import os
     9import numpy
    910
    1011from xpra.codecs.image_wrapper import ImageWrapper
    1112from xpra.log import Logger, debug_if_env
     
    3637    void* malloc(size_t __size)
    3738    void free(void* mem)
    3839
    39 #could also use pycuda...
    4040cdef extern from "cuda.h":
    41     ctypedef int CUdevice
    4241    ctypedef int CUresult
    43     ctypedef void* CUdeviceptr
    4442    ctypedef void* CUcontext
    45     ctypedef enum CUdevice_attribute:
    46         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
    47         CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH
    48         CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT
    49         CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
    50         CU_DEVICE_ATTRIBUTE_PCI_BUS_ID
    51         CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID
    52         CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
     43    CUresult cuCtxGetCurrent(CUcontext *pctx)
    5344
    54     CUresult cuInit(unsigned int flags)
    55     CUresult cuDeviceGet(CUdevice *device, int ordinal)
    56     CUresult cuDeviceGetCount(int *count)
    57     CUresult cuDeviceGetName(char *name, int len, CUdevice dev)
    58     CUresult cuDeviceComputeCapability(int *major, int *minor, CUdevice dev)
    59 
    60     CUresult cuDeviceTotalMem(size_t *bytes, CUdevice dev)
    61     CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev)
    62 
    63     CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
    64     CUresult cuCtxPopCurrent(CUcontext *pctx)
    65     CUresult cuCtxPushCurrent(CUcontext ctx)
    66     CUresult cuCtxDestroy(CUcontext ctx)
    67 
    68     CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
    69     CUresult cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes)
    70     CUresult cuMemAllocHost(void **pp, size_t bytesize)
    71     CUresult cuMemFree(CUdeviceptr dptr)
    72     CUresult cuMemFreeHost(void *p)
    73 
    74 
    7545cdef extern from "NvTypes.h":
    7646    pass
    7747
     
    10611031    return (n + m - 1) & ~(m - 1)
    10621032
    10631033
    1064 CUDA_STATUS_TXT = {}
    1065 cdef cudaStatusInfo(CUresult ret):
    1066     if ret in CUDA_STATUS_TXT:
    1067         return "%s: %s" % (ret, CUDA_STATUS_TXT[ret])
    1068     return str(ret)
     1034def device_info(d):
     1035    return "%s @ %s" % (d.name(), d.pci_bus_id())
    10691036
    1070 cdef checkCuda(CUresult ret, msg=""):
    1071     if ret!=0:
    1072         log.warn("error during %s: %s", msg, cudaStatusInfo(ret))
    1073     return ret
    1074 cdef raiseCUDA(CUresult ret, msg=""):
    1075     if ret!=0:
    1076         raise Exception("%s - returned %s" % (msg, cudaStatusInfo(ret)))
    1077 
    10781037cdef cuda_init_devices():
    1079     cdef int deviceCount, i
    1080     cdef CUdevice cuDevice
    1081     cdef char gpu_name[100]
    1082     cdef int SMminor, SMmajor
    1083     cdef size_t totalMem
    1084     cdef int multiProcessorCount
    1085     cdef int max_width, max_height
    1086     cdef int canMapHostMemory
    1087     cdef int computeMode
    1088     cdef int pciBusID, pciDeviceID
    1089     cdef CUresult r
    10901038    start = time.time()
    10911039    log.info("CUDA initialization (this may take a few seconds)")
    1092     raiseCUDA(cuInit(0), "cuInit")
    1093     debug("cuda_init_devices() cuInit() took %.1fms", 1000.0*(time.time()-start))
    1094     raiseCUDA(cuDeviceGetCount(&deviceCount), "failed to get device count")
    1095     debug("cuda_init_devices() found %s devices", deviceCount)
     1040    import pycuda
     1041    from pycuda import driver
     1042    driver.init()
     1043    ngpus = driver.Device.count()
     1044    debug("PyCUDA found %s devices:", ngpus)
    10961045    devices = {}
    1097     for i in range(deviceCount):
    1098         r = cuDeviceGet(&cuDevice, i)
    1099         checkCuda(r, "cuDeviceGet")
    1100         if r!=0:
    1101             continue
    1102         checkCuda(cuDeviceGetName(gpu_name, 100, cuDevice), "cuDeviceGetName")
    1103         checkCuda(cuDeviceComputeCapability(&SMmajor, &SMminor, i), "cuDeviceComputeCapability")
     1046    for i in range(ngpus):
     1047        d = driver.Device(i)
     1048        mem = d.total_memory()
     1049        host_mem = d.get_attribute(driver.device_attribute.CAN_MAP_HOST_MEMORY)
     1050        #SMmajor, SMminor = d.compute_cabability()
     1051        SMmajor, SMminor = 0xFFFF, 0xFFFF
    11041052        has_nvenc = ((SMmajor<<4) + SMminor) >= 0x30
    1105         cuDeviceGetAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cuDevice)
    1106         cuDeviceGetAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cuDevice)
    1107         raiseCUDA(cuDeviceTotalMem(&totalMem, cuDevice), "cuDeviceTotalMem")
    1108         debug("device[%s]=%s (%sMB) - PCI: %02d:%02d - compute %s.%s (nvenc=%s)",
    1109                 i, gpu_name, int(totalMem/1024/1024), pciBusID, pciDeviceID, SMmajor, SMminor, has_nvenc)
    1110         devices[i] = "%s - PCI: %02d:%02d" % (gpu_name, pciBusID, pciDeviceID)
    1111         cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice)
    1112         #log.info("multiProcessorCount=%s", multiProcessorCount)
    1113         #printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
    1114                 #multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
    1115                 #_ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
    1116         cuDeviceGetAttribute(&max_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, cuDevice)
    1117         cuDeviceGetAttribute(&max_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, cuDevice)
    1118         cuDeviceGetAttribute(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)
    1119         cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cuDevice)
    1120         debug(" Can Map Host Memory: %s, Compute Mode: %s, MultiProcessor Count: %s, max dimensions: %sx%s", canMapHostMemory, computeMode, multiProcessorCount, max_width, max_height)
     1053        pre = "-"
     1054        if host_mem and has_nvenc:
     1055            pre = "+"
     1056            devices[i] = device_info(d)
     1057        debug(" %s %s (%sMB)", pre, device_info(d), mem/1024/1024)
     1058        max_width = d.get_attribute(driver.device_attribute.MAXIMUM_TEXTURE2D_WIDTH)
     1059        max_height = d.get_attribute(driver.device_attribute.MAXIMUM_TEXTURE2D_HEIGHT)
     1060        debug(" Can Map Host Memory: %s, Compute Mode: %s, max dimensions: %sx%s", host_mem, (SMmajor, SMminor), max_width, max_height)
    11211061    end = time.time()
    11221062    debug("cuda_init_devices() took %.1fms", 1000.0*(end-start))
    11231063    return devices
     
    11311071            log.info(" + %s", cuda_devices.get(device_id))
    11321072    return cuda_devices
    11331073
    1134 cdef CUdevice get_cuda_device(deviceId=0):
    1135     global cuda_devices
    1136     cdef CUdevice cuDevice            #@DuplicatedSignature
    1137     cdef char gpu_name[100]           #@DuplicatedSignature
    1138     cdef int SMminor, SMmajor         #@DuplicatedSignature
    1139     if deviceId<0:
    1140         deviceId = 0
    1141     if deviceId not in cuda_devices:
    1142         raise Exception("invalid deviceId %s: only %s devices found" % (deviceId, len(cuda_devices)))
    1143     raiseCUDA(cuDeviceGet(&cuDevice, deviceId), "cuDeviceGet")
    1144     raiseCUDA(cuDeviceGetName(gpu_name, 100, cuDevice), "cuDeviceGetName")
    1145     debug("using CUDA device %s: %s", deviceId, gpu_name)
    1146     raiseCUDA(cuDeviceComputeCapability(&SMmajor, &SMminor, deviceId))
    1147     has_nvenc = ((SMmajor<<4) + SMminor) >= 0x30
    1148     if FORCE and not has_nvenc:
    1149         log.warn("selected device %s does not have NVENC capability!" % gpu_name)
    1150     else:
    1151         assert has_nvenc, "selected device %s does not have NVENC capability!" % gpu_name
    1152     return cuDevice
    1153 
    11541074DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))
    11551075
    11561076def cuda_check():
    1157     cdef CUcontext context
    1158     cdef CUdevice cuDevice              #@DuplicatedSignature
    1159     cuda_devices = get_cuda_devices()
    1160     if len(cuda_devices)==0:
     1077    global DEFAULT_CUDA_DEVICE_ID
     1078    devices = get_cuda_devices()
     1079    if len(devices)==0:
    11611080        raise ImportError("no CUDA devices found!")
    1162     assert DEFAULT_CUDA_DEVICE_ID in cuda_devices.keys(), "specified CUDA device ID %s not found in %s" % (DEFAULT_CUDA_DEVICE_ID, cuda_devices)
    1163     cuDevice = get_cuda_device(DEFAULT_CUDA_DEVICE_ID)
     1081    assert DEFAULT_CUDA_DEVICE_ID in cuda_devices.keys(), "specified CUDA device ID %s not found in %s" % (DEFAULT_CUDA_DEVICE_ID, devices)
    11641082
    1165     raiseCUDA(cuCtxCreate(&context, 0, cuDevice), "creating CUDA context")
    1166     raiseCUDA(cuCtxPopCurrent(&context), "popping current context")
    1167     raiseCUDA(cuCtxDestroy(context), "destroying current context")
     1083    #create context for testing:
     1084    from pycuda import driver
     1085    d = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     1086    context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     1087    debug("cuda_check created test context, api_version=%s", context.get_api_version())
     1088    context.pop()
     1089    context.detach()
    11681090
    11691091
    11701092cdef nvencStatusInfo(NVENCSTATUS ret):
     
    11771099        raise Exception("%s - returned %s" % (msg, nvencStatusInfo(ret)))
    11781100
    11791101
     1102#BGRA2NV12_functions = {}
     1103def get_BGRA2NV12():
     1104    #global BGRA2NV12_function
     1105    #if BGRA2NV12_function:
     1106    #    return  BGRA2NV12_function
     1107    from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel
     1108    from pycuda.compiler import SourceModule
     1109    log.info("BGRA2NV12=%s", BGRA2NV12_kernel)
     1110    mod = SourceModule(BGRA2NV12_kernel)
     1111    BGRA2NV12_function = mod.get_function("BGRA2NV12")
     1112    return BGRA2NV12_function
     1113
     1114#import pycuda.driver.PointerHolderBase
     1115#class CustomCUDABuffer(pycuda.driver.PointerHolderBase):
     1116#    def __init__(self, pointer):
     1117#        self.pointer = pointer
     1118#    def get_pointer(self):
     1119#        return self.pointer
     1120
    11801121cdef class Encoder:
    11811122    cdef int width
    11821123    cdef int height
    11831124    cdef int encoder_width
    11841125    cdef int encoder_height
    11851126    cdef object src_format
    1186     cdef CUcontext cuda_context
     1127    #PyCUDA:
     1128    cdef object driver
     1129    cdef object cuda_device
     1130    cdef object cuda_context
     1131    cdef object BGRA2NV12
     1132    #NVENC:
    11871133    cdef NV_ENCODE_API_FUNCTION_LIST functionList               #@DuplicatedSignature
    11881134    cdef void *context
    11891135    cdef NV_ENC_REGISTERED_PTR inputHandle
    1190     cdef CUdeviceptr cudaBuffer
    1191     cdef void *inputBuffer
    1192     cdef size_t pitch
     1136    cdef object inputBuffer
     1137    cdef object cudaInputBuffer
     1138    cdef object cudaNV12Buffer
     1139    cdef int inputPitch
     1140    cdef int NV12Pitch
    11931141    cdef void *bitstreamBuffer
    11941142    cdef NV_ENC_BUFFER_FORMAT bufferFmt
    11951143    cdef object codec_name
     
    12261174        self.codec_name = "H264"
    12271175        self.preset_name = None
    12281176        self.frames = 0
    1229         self.cuda_context = NULL
     1177        self.cuda_device = None
     1178        self.cuda_context = None
    12301179        start = time.time()
    12311180
    12321181        device_id = options.get("cuda_device", DEFAULT_CUDA_DEVICE_ID)
     
    12371186
    12381187    def init_cuda(self, device_id):
    12391188        assert device_id in get_cuda_devices().keys(), "invalid device_id '%s' (available: %s)" % (device_id, cuda_devices)
    1240         cdef CUdevice cuda_device              #@DuplicatedSignature
    1241         cuda_device = get_cuda_device(DEFAULT_CUDA_DEVICE_ID)
    1242         raiseCUDA(cuCtxCreate(&self.cuda_context, 0, cuda_device))
    1243         debug("cuCtxCreate: device_id=%s, cuda_device=%s, cuda_context=%s", device_id, cuda_device, hex(<long> self.cuda_context))
    1244         #allocate CUDA input buffer (on device):
    1245         raiseCUDA(cuMemAllocPitch(&self.cudaBuffer, &self.pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device")
    1246         debug("cudaBuffer=%s, pitch=%s", hex(<long> self.cudaBuffer), self.pitch)
    1247         #allocate buffer on host:
    1248         raiseCUDA(cuMemAllocHost(&self.inputBuffer, self.pitch*self.encoder_height*3/2), "allocating CUDA input buffer on host")
    1249         debug("inputBuffer=%s", hex(<long> self.inputBuffer))
     1189        from pycuda import driver
     1190        self.driver = driver
     1191        debug("init_cuda(%s)", device_id)
     1192        self.cuda_device = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     1193        self.cuda_context = self.cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     1194        try:
     1195            debug("init_cuda(%s) cuda_device=%s, cuda_context=%s", device_id, self.cuda_device, self.cuda_context)
     1196            #compile/get kernel:
     1197            self.BGRA2NV12 = get_BGRA2NV12()
     1198            #allocate CUDA input buffer (on device):
     1199            self.cudaInputBuffer, self.inputPitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3/2, 16)
     1200            debug("CUDA Input Buffer=%s, pitch=%s", hex(int(self.cudaInputBuffer)), self.inputPitch)
     1201            #allocate CUDA NV12 buffer (on device):
     1202            self.cudaNV12Buffer, self.NV12Pitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3/2, 16)
     1203            debug("CUDA NV12 Buffer=%s, pitch=%s", hex(int(self.cudaNV12Buffer)), self.NV12Pitch)
     1204            #allocate input buffer on host:
     1205            #self.inputBuffer = driver.pagelocked_empty(self.encoder_width*self.encoder_height, dtype=numpy.byte)
     1206            self.inputBuffer = driver.pagelocked_zeros(self.inputPitch*self.encoder_height*3/2, dtype=numpy.byte)
     1207            debug("inputBuffer=%s", self.inputBuffer)
     1208   
     1209            self.init_nvenc()
     1210        finally:
     1211            self.cuda_context.pop()
    12501212
    1251         self.init_nvenc()
    1252 
    1253         raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
    1254 
    12551213    def init_nvenc(self):
    12561214        cdef GUID codec
    12571215        cdef GUID preset
     
    12591217        cdef NV_ENC_INITIALIZE_PARAMS params
    12601218        cdef NV_ENC_PRESET_CONFIG *presetConfig     #@DuplicatedSignature
    12611219        cdef NV_ENC_REGISTER_RESOURCE registerResource
     1220        cdef NV_ENC_CREATE_INPUT_BUFFER createInputBufferParams
     1221        cdef NV_ENC_CREATE_BITSTREAM_BUFFER createBitstreamBufferParams
     1222        cdef long resource
    12621223
    12631224        self.open_encode_session()
    12641225        codec = self.get_codec()
     
    12681229        input_format = BUFFER_FORMAT[self.bufferFmt]
    12691230        input_formats = self.query_input_formats(codec)
    12701231        assert input_format in input_formats, "%s does not support %s (only: %s)" %  (self.codec_name, input_format, input_formats)
    1271 
    1272         cdef NV_ENC_CREATE_INPUT_BUFFER createInputBufferParams
    1273         cdef NV_ENC_CREATE_BITSTREAM_BUFFER createBitstreamBufferParams
    12741232        try:
    12751233            presetConfig = self.get_preset_config(codec, preset)
    12761234
     
    12971255            memset(&registerResource, 0, sizeof(NV_ENC_REGISTER_RESOURCE))
    12981256            registerResource.version = NV_ENC_REGISTER_RESOURCE_VER
    12991257            registerResource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR
    1300             registerResource.resourceToRegister = <void *> self.cudaBuffer
     1258            resource = int(self.cudaNV12Buffer)
     1259            registerResource.resourceToRegister = <void *> resource
    13011260            registerResource.width = self.encoder_width
    13021261            registerResource.height = self.encoder_height
    1303             registerResource.pitch = self.pitch
     1262            registerResource.pitch = self.NV12Pitch
    13041263            raiseNVENC(self.functionList.nvEncRegisterResource(self.context, &registerResource), "registering CUDA input buffer")
    13051264            self.inputHandle = registerResource.registeredResource
    13061265            debug("input handle for CUDA buffer: %s", hex(<long> self.inputHandle))
     
    13371296
    13381297    def clean(self):                        #@DuplicatedSignature
    13391298        debug("clean() context=%s", hex(<long> self.context))
    1340         if self.cuda_context!=NULL:
    1341             raiseCUDA(cuCtxPushCurrent(self.cuda_context), "failed to push context")
     1299        if self.cuda_context:
     1300            self.cuda_context.push()
    13421301            try:
    13431302                self.cuda_clean()
    13441303            finally:
    1345                 raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
    1346                 cuCtxDestroy(self.cuda_context)
    1347                 self.cuda_context = NULL
     1304                self.cuda_context.pop()
     1305                self.cuda_context.detach()
     1306                self.cuda_context = None
    13481307
    13491308    def cuda_clean(self):
    1350         if self.inputHandle!=NULL:
     1309        if self.inputHandle!=NULL and self.context!=NULL:
    13511310            debug("clean() unregistering %s", hex(<long> self.inputHandle))
    13521311            raiseNVENC(self.functionList.nvEncUnregisterResource(self.context, self.inputHandle), "unregistering CUDA input buffer")
    13531312            self.inputHandle = NULL
    1354         if self.inputBuffer!=NULL:
    1355             debug("clean() freeing CUDA host buffer %s", hex(<long> self.inputBuffer))
    1356             raiseCUDA(cuMemFreeHost(self.inputBuffer), "freeing host buffer")
    1357             self.inputBuffer = NULL
    1358         if (<void *> self.cudaBuffer)!=NULL:
    1359             debug("clean() freeing CUDA device buffer %s", hex(<long> self.cudaBuffer))
    1360             raiseCUDA(cuMemFree(self.cudaBuffer), "freeing CUDA buffer")
    1361             self.cudaBuffer = <CUdeviceptr> NULL
    1362         if self.bitstreamBuffer!=NULL:
    1363             debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
    1364             raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
    1365             self.bitstreamBuffer = NULL
    1366         raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
     1313        if self.inputBuffer is not None:
     1314            debug("clean() freeing CUDA host buffer %s", self.inputBuffer)
     1315            self.inputBuffer = None
     1316        if self.cudaInputBuffer is not None:
     1317            debug("clean() freeing CUDA input buffer %s", hex(int(self.cudaInputBuffer)))
     1318            self.cudaInputBuffer.free()
     1319            self.cudaInputBuffer = None
     1320        if self.cudaNV12Buffer is not None:
     1321            debug("clean() freeing CUDA NV12 buffer %s", hex(int(self.cudaNV12Buffer)))
     1322            self.cudaNV12Buffer.free()
     1323            self.cudaNV12Buffer = None
     1324        if self.context!=NULL:
     1325            if self.bitstreamBuffer!=NULL:
     1326                debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
     1327                raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
     1328                self.bitstreamBuffer = NULL
     1329            debug("clean() destroying encoder %s", hex(<long> self.context))
     1330            raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
    13671331
    13681332    def get_width(self):
    13691333        return self.width
     
    13971361        self.functionList.nvEncEncodePicture(self.context, &picParams)
    13981362
    13991363    def compress_image(self, image, options={}):
    1400         raiseCUDA(cuCtxPushCurrent(self.cuda_context), "failed to push context")
     1364        self.cuda_context.push()
    14011365        try:
    14021366            return self.do_compress_image(image, options)
    14031367        finally:
    1404             raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
     1368            self.cuda_context.pop()
    14051369
    14061370    def do_compress_image(self, image, options={}):
    14071371        cdef const void* Y = NULL
     
    14311395        debug("compress_image(..) pixels=%s", type(pixels))
    14321396
    14331397        #copy to input buffer:
    1434         size = self.pitch * self.encoder_height * 3/2
    1435         memset(self.inputBuffer, 0, size)
     1398        size = self.inputPitch * self.encoder_height * 3/2
    14361399        #copy luma:
    1437         assert PyObject_AsReadBuffer(pixels[0], &Y, &Y_len)==0
    1438         assert PyObject_AsReadBuffer(pixels[1], &Cb, &Cb_len)==0
    1439         assert PyObject_AsReadBuffer(pixels[2], &Cr, &Cr_len)==0
    14401400        stride = strides[0]
     1401        Yplane = pixels[0]
     1402        buffer = self.inputBuffer.data
    14411403        for y in range(h):
    1442             memcpy(self.inputBuffer + y*self.pitch, Y + stride*y, w)
     1404            dst = y * self.inputPitch
     1405            src = y * stride
     1406            #debug("%s: %s:%s (size=%s) <- %s:%s (size=%s)", y, dst, dst+w, len(buffer), src, src+w, len(Yplane))
     1407            buffer[dst:(dst+w)] = Yplane[src:(src+w)]
    14431408        #copy chroma packed:
    14441409        assert strides[1]==strides[2], "U and V strides differ: %s vs %s" % (strides[1], strides[2])
    14451410        stride = strides[1]
     1411        Uplane = pixels[1]
     1412        Vplane = pixels[2]
    14461413        for y in range(h/2):
    1447             offset = (self.encoder_height + y) * self.pitch
     1414            offset = (self.encoder_height + y) * self.inputPitch
    14481415            for x in range(w/2):
    1449                 (<char*> self.inputBuffer)[offset + (x*2)] = (<char *> Cb)[stride*y + x]
    1450                 (<char*> self.inputBuffer)[offset + (x*2)+1] = (<char *> Cr)[stride*y + x]
     1416                buffer[offset + (x*2)] = numpy.byte(Uplane[stride*y + x])
     1417                buffer[offset + (x*2) + 1] = numpy.byte(Vplane[stride*y + x])
     1418        debug("compress_image(..) host buffer populated")
    14511419
    14521420        #copy input buffer to CUDA buffer:
    1453         raiseCUDA(cuMemcpyHtoD(self.cudaBuffer, self.inputBuffer, size), "copy from host to device")
     1421        self.driver.memcpy_htod(self.cudaInputBuffer, self.inputBuffer)
    14541422        debug("compress_image(..) input buffer copied to device")
    14551423
     1424        #FIXME: just for testing fill the buffer with our input already:
     1425        self.driver.memcpy_htod(self.cudaNV12Buffer, self.inputBuffer)
     1426        self.BGRA2NV12(self.cudaInputBuffer, numpy.int32(self.inputPitch),
     1427                       self.cudaNV12Buffer, numpy.int32(self.NV12Pitch),
     1428                       numpy.int32(self.width), numpy.int32(self.height),
     1429                       block=(16,16,1))
     1430        debug("compress_image(..) kernel executed")
     1431
    14561432        #map buffer so nvenc can access it:
    14571433        memset(&mapInputResource, 0, sizeof(NV_ENC_MAP_INPUT_RESOURCE))
    14581434        mapInputResource.version = NV_ENC_MAP_INPUT_RESOURCE_VER
    14591435        mapInputResource.registeredResource  = self.inputHandle
    1460         raiseCUDA(self.functionList.nvEncMapInputResource(self.context, &mapInputResource), "mapping input resource")
     1436        raiseNVENC(self.functionList.nvEncMapInputResource(self.context, &mapInputResource), "mapping input resource")
    14611437        debug("compress_image(..) device buffer mapped to %s", hex(<long> mapInputResource.mappedResource))
    14621438
    14631439        try:
     
    14671443            picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME
    14681444            picParams.inputWidth = self.encoder_width
    14691445            picParams.inputHeight = self.encoder_height
    1470             picParams.inputPitch = self.pitch
     1446            picParams.inputPitch = self.NV12Pitch
    14711447            picParams.inputBuffer = mapInputResource.mappedResource
    14721448            picParams.outputBitstream = self.bitstreamBuffer
    14731449            #picParams.pictureType: required when enablePTD is disabled
     
    15031479            pixels = (<char *> lockOutputBuffer.bitstreamBufferPtr)[:size]
    15041480        finally:
    15051481            raiseNVENC(self.functionList.nvEncUnlockBitstream(self.context, self.bitstreamBuffer), "unlocking output buffer")
    1506             raiseCUDA(self.functionList.nvEncUnmapInputResource(self.context, mapInputResource.mappedResource), "unmapping input resource")
     1482            raiseNVENC(self.functionList.nvEncUnmapInputResource(self.context, mapInputResource.mappedResource), "unmapping input resource")
    15071483
    15081484        end = time.time()
    15091485        self.frames += 1
     
    15371513        preset_GUIDs = <GUID*> malloc(sizeof(GUID) * presetCount)
    15381514        assert preset_GUIDs!=NULL, "could not allocate memory for %s preset GUIDs!" % (presetCount)
    15391515        try:
    1540             raiseNVENC(self.functionList.nvEncGetEncodePresetGUIDs(self.context, encode_GUID, preset_GUIDs, presetCount, &presetsRetCount))
     1516            raiseNVENC(self.functionList.nvEncGetEncodePresetGUIDs(self.context, encode_GUID, preset_GUIDs, presetCount, &presetsRetCount), "getting encode presets")
    15411517            assert presetsRetCount==presetCount
    15421518            for x in range(presetCount):
    15431519                preset_GUID = preset_GUIDs[x]
     
    15681544        assert profile_GUIDs!=NULL, "could not allocate memory for %s profile GUIDs!" % (profileCount)
    15691545        PROFILES_GUIDS = CODEC_PROFILES_GUIDS.get(guidstr(encode_GUID), {})
    15701546        try:
    1571             raiseNVENC(self.functionList.nvEncGetEncodeProfileGUIDs(self.context, encode_GUID, profile_GUIDs, profileCount, &profilesRetCount))
     1547            raiseNVENC(self.functionList.nvEncGetEncodeProfileGUIDs(self.context, encode_GUID, profile_GUIDs, profileCount, &profilesRetCount), "getting encode profiles")
    15721548            #(void* encoder, GUID encodeGUID, GUID* profileGUIDs, uint32_t guidArraySize, uint32_t* GUIDCount)
    15731549            assert profilesRetCount==profileCount
    15741550            for x in range(profileCount):
     
    16131589        encCaps.version = NV_ENC_CAPS_PARAM_VER
    16141590        encCaps.capsToQuery = caps_type
    16151591
    1616         raiseNVENC(self.functionList.nvEncGetEncodeCaps(self.context, encodeGUID, &encCaps, &val))
     1592        raiseNVENC(self.functionList.nvEncGetEncodeCaps(self.context, encodeGUID, &encCaps, &val), "getting encode caps")
    16171593        return val
    16181594
    16191595    cdef query_codecs(self, full_query=False):
     
    16221598        cdef GUID* encode_GUIDs
    16231599        cdef GUID encode_GUID
    16241600
    1625         raiseNVENC(self.functionList.nvEncGetEncodeGUIDCount(self.context, &GUIDCount))
     1601        raiseNVENC(self.functionList.nvEncGetEncodeGUIDCount(self.context, &GUIDCount), "getting encoder count")
    16261602        debug("found %s encode GUIDs", GUIDCount)
    16271603        assert GUIDCount<2**8
    16281604        encode_GUIDs = <GUID*> malloc(sizeof(GUID) * GUIDCount)
     
    16621638
    16631639    cdef open_encode_session(self):
    16641640        cdef NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params
    1665         debug("open_encode_session(%s)", hex(<long> self.cuda_context))
     1641        debug("open_encode_session() cuda_context=%s", self.cuda_context)
     1642        debug("open_encode_session() cuda_context=%s", dir(self.cuda_context))
    16661643
    16671644        #get NVENC function pointers:
    16681645        memset(&self.functionList, 0, sizeof(NV_ENCODE_API_FUNCTION_LIST))
     
    16701647        raiseNVENC(NvEncodeAPICreateInstance(&self.functionList), "getting API function list")
    16711648        assert self.functionList.nvEncOpenEncodeSessionEx!=NULL, "looks like NvEncodeAPICreateInstance failed!"
    16721649
     1650        #get the CUDA context (C pointer):
     1651        cdef CUcontext cuda_context
     1652        cdef CUresult result
     1653        result = cuCtxGetCurrent(&cuda_context)
     1654        assert result==0, "failed to get current cuda context"
     1655
    16731656        #NVENC init:
    16741657        memset(&params, 0, sizeof(NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS))
    16751658        params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER
    16761659        params.deviceType = NV_ENC_DEVICE_TYPE_CUDA
    1677         params.device = <void*> self.cuda_context
     1660        params.device = <void*> cuda_context
    16781661        params.clientKeyPtr = &CLIENT_KEY_GUID
    16791662        params.apiVersion = NVENCAPI_VERSION
    16801663        #params.clientKeyPtr = client_key