


Ticket #370: nvenc-pycuda.patch

File nvenc-pycuda.patch, 23.6 KB (added by Antoine Martin, 8 years ago)

Use pycuda to remove lots of code... except this does not work because we need a context pointer for nvenc :(
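
For reference, one possible way around the missing context pointer (a sketch only, not part of this patch, assuming NVENC accepts a raw CUcontext via NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS.device with deviceType set to NV_ENC_DEVICE_TYPE_CUDA): keep PyCUDA for device and context management, and query the current raw context from the CUDA driver API with cuCtxGetCurrent while the PyCUDA context is pushed, roughly like this in Cython:

#sketch only - not part of this patch:
#recover the raw CUcontext pointer that NVENC needs,
#while letting PyCUDA own the context lifecycle
cdef extern from "cuda.h":
    ctypedef void* CUcontext
    ctypedef int CUresult
    CUresult cuCtxGetCurrent(CUcontext *pctx)

def get_nvenc_device_pointer(pycuda_context):
    cdef CUcontext ctx = NULL
    pycuda_context.push()       #make the PyCUDA context current on this thread
    try:
        if cuCtxGetCurrent(&ctx)!=0 or ctx==NULL:
            raise Exception("cuCtxGetCurrent failed")
        #this value could be passed to NVENC when opening the encode session
        return <unsigned long> ctx
    finally:
        pycuda_context.pop()

The NVENC session could then be opened against that pointer while all buffer and context management stays in PyCUDA. (A hedged usage sketch for the BGRA2NV12 kernel itself follows after the diff.)
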

  • xpra/codecs/nvenc/CUDA_rgb2nv12.py

     
     1# This file is part of Xpra.
     2# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
     3# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
     4# later version. See the file COPYING for details.
     5
     6
     7BGRA2NV12_kernel = """
     8#include <stdint.h>
     9
     10__global__ void BGRA2NV12(uint8_t *srcImage,     size_t srcPitch,
     11                          uint8_t *dstImage,     size_t dstPitch,
     12                          uint32_t width,         uint32_t height)
     13{
     14    int32_t x, y;
     15    x = blockIdx.x * blockDim.x + threadIdx.x;
     16    y = blockIdx.y * blockDim.y + threadIdx.y;
     17
     18    if (x >= width)
     19        return;
     20    if (y >= height)
     21        return;
     22
     23    //just copy for now...
     24    dstImage[y * dstPitch + x] = srcImage[y * srcPitch + x];
     25}
     26"""
     27 No newline at end of file
  • xpra/codecs/nvenc/encoder.pyx

     
    66import binascii
    77import time
    88import os
     9import numpy
    910
    1011from xpra.codecs.image_wrapper import ImageWrapper
    1112from xpra.log import Logger, debug_if_env
     
    3637    void* malloc(size_t __size)
    3738    void free(void* mem)
    3839
    39 #could also use pycuda...
    40 cdef extern from "cuda.h":
    41     ctypedef int CUdevice
    42     ctypedef int CUresult
    43     ctypedef void* CUdeviceptr
    44     ctypedef void* CUcontext
    45     ctypedef enum CUdevice_attribute:
    46         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
    47         CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH
    48         CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT
    49         CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
    50         CU_DEVICE_ATTRIBUTE_PCI_BUS_ID
    51         CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID
    52         CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
    53 
    54     CUresult cuInit(unsigned int flags)
    55     CUresult cuDeviceGet(CUdevice *device, int ordinal)
    56     CUresult cuDeviceGetCount(int *count)
    57     CUresult cuDeviceGetName(char *name, int len, CUdevice dev)
    58     CUresult cuDeviceComputeCapability(int *major, int *minor, CUdevice dev)
    59 
    60     CUresult cuDeviceTotalMem(size_t *bytes, CUdevice dev)
    61     CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev)
    62 
    63     CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
    64     CUresult cuCtxPopCurrent(CUcontext *pctx)
    65     CUresult cuCtxPushCurrent(CUcontext ctx)
    66     CUresult cuCtxDestroy(CUcontext ctx)
    67 
    68     CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
    69     CUresult cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes)
    70     CUresult cuMemAllocHost(void **pp, size_t bytesize)
    71     CUresult cuMemFree(CUdeviceptr dptr)
    72     CUresult cuMemFreeHost(void *p)
    73 
    74 
    7540cdef extern from "NvTypes.h":
    7641    pass
    7742
     
    10611026    return (n + m - 1) & ~(m - 1)
    10621027
    10631028
    1064 CUDA_STATUS_TXT = {}
    1065 cdef cudaStatusInfo(CUresult ret):
    1066     if ret in CUDA_STATUS_TXT:
    1067         return "%s: %s" % (ret, CUDA_STATUS_TXT[ret])
    1068     return str(ret)
     1029def device_info(d):
     1030    return "%s @ %s" % (d.name(), d.pci_bus_id())
    10691031
    1070 cdef checkCuda(CUresult ret, msg=""):
    1071     if ret!=0:
    1072         log.warn("error during %s: %s", msg, cudaStatusInfo(ret))
    1073     return ret
    1074 cdef raiseCUDA(CUresult ret, msg=""):
    1075     if ret!=0:
    1076         raise Exception("%s - returned %s" % (msg, cudaStatusInfo(ret)))
    1077 
    10781032cdef cuda_init_devices():
    1079     cdef int deviceCount, i
    1080     cdef CUdevice cuDevice
    1081     cdef char gpu_name[100]
    1082     cdef int SMminor, SMmajor
    1083     cdef size_t totalMem
    1084     cdef int multiProcessorCount
    1085     cdef int max_width, max_height
    1086     cdef int canMapHostMemory
    1087     cdef int computeMode
    1088     cdef int pciBusID, pciDeviceID
    1089     cdef CUresult r
    10901033    start = time.time()
    10911034    log.info("CUDA initialization (this may take a few seconds)")
    1092     raiseCUDA(cuInit(0), "cuInit")
    1093     debug("cuda_init_devices() cuInit() took %.1fms", 1000.0*(time.time()-start))
    1094     raiseCUDA(cuDeviceGetCount(&deviceCount), "failed to get device count")
    1095     debug("cuda_init_devices() found %s devices", deviceCount)
     1035    import pycuda
     1036    from pycuda import driver
     1037    driver.init()
     1038    ngpus = driver.Device.count()
     1039    debug("PyCUDA found %s devices:", ngpus)
    10961040    devices = {}
    1097     for i in range(deviceCount):
    1098         r = cuDeviceGet(&cuDevice, i)
    1099         checkCuda(r, "cuDeviceGet")
    1100         if r!=0:
    1101             continue
    1102         checkCuda(cuDeviceGetName(gpu_name, 100, cuDevice), "cuDeviceGetName")
    1103         checkCuda(cuDeviceComputeCapability(&SMmajor, &SMminor, i), "cuDeviceComputeCapability")
     1041    for i in range(ngpus):
     1042        d = driver.Device(i)
     1043        mem = d.total_memory()
     1044        host_mem = d.get_attribute(driver.device_attribute.CAN_MAP_HOST_MEMORY)
     1045        #SMmajor, SMminor = d.compute_capability()
     1046        SMmajor, SMminor = 0xFFFF, 0xFFFF
    11041047        has_nvenc = ((SMmajor<<4) + SMminor) >= 0x30
    1105         cuDeviceGetAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cuDevice)
    1106         cuDeviceGetAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cuDevice)
    1107         raiseCUDA(cuDeviceTotalMem(&totalMem, cuDevice), "cuDeviceTotalMem")
    1108         debug("device[%s]=%s (%sMB) - PCI: %02d:%02d - compute %s.%s (nvenc=%s)",
    1109                 i, gpu_name, int(totalMem/1024/1024), pciBusID, pciDeviceID, SMmajor, SMminor, has_nvenc)
    1110         devices[i] = "%s - PCI: %02d:%02d" % (gpu_name, pciBusID, pciDeviceID)
    1111         cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice)
    1112         #log.info("multiProcessorCount=%s", multiProcessorCount)
    1113         #printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
    1114                 #multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
    1115                 #_ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
    1116         cuDeviceGetAttribute(&max_width, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, cuDevice)
    1117         cuDeviceGetAttribute(&max_height, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, cuDevice)
    1118         cuDeviceGetAttribute(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)
    1119         cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cuDevice)
    1120         debug(" Can Map Host Memory: %s, Compute Mode: %s, MultiProcessor Count: %s, max dimensions: %sx%s", canMapHostMemory, computeMode, multiProcessorCount, max_width, max_height)
     1048        pre = "-"
     1049        if host_mem and has_nvenc:
     1050            pre = "+"
     1051            devices[i] = device_info(d)
     1052        debug(" %s %s (%sMB)", pre, device_info(d), mem/1024/1024)
     1053        max_width = d.get_attribute(driver.device_attribute.MAXIMUM_TEXTURE2D_WIDTH)
     1054        max_height = d.get_attribute(driver.device_attribute.MAXIMUM_TEXTURE2D_HEIGHT)
     1055        debug(" Can Map Host Memory: %s, Compute Mode: %s, max dimensions: %sx%s", host_mem, (SMmajor, SMminor), max_width, max_height)
    11211056    end = time.time()
    11221057    debug("cuda_init_devices() took %.1fms", 1000.0*(end-start))
    11231058    return devices
     
    11311066            log.info(" + %s", cuda_devices.get(device_id))
    11321067    return cuda_devices
    11331068
    1134 cdef CUdevice get_cuda_device(deviceId=0):
    1135     global cuda_devices
    1136     cdef CUdevice cuDevice            #@DuplicatedSignature
    1137     cdef char gpu_name[100]           #@DuplicatedSignature
    1138     cdef int SMminor, SMmajor         #@DuplicatedSignature
    1139     if deviceId<0:
    1140         deviceId = 0
    1141     if deviceId not in cuda_devices:
    1142         raise Exception("invalid deviceId %s: only %s devices found" % (deviceId, len(cuda_devices)))
    1143     raiseCUDA(cuDeviceGet(&cuDevice, deviceId), "cuDeviceGet")
    1144     raiseCUDA(cuDeviceGetName(gpu_name, 100, cuDevice), "cuDeviceGetName")
    1145     debug("using CUDA device %s: %s", deviceId, gpu_name)
    1146     raiseCUDA(cuDeviceComputeCapability(&SMmajor, &SMminor, deviceId))
    1147     has_nvenc = ((SMmajor<<4) + SMminor) >= 0x30
    1148     if FORCE and not has_nvenc:
    1149         log.warn("selected device %s does not have NVENC capability!" % gpu_name)
    1150     else:
    1151         assert has_nvenc, "selected device %s does not have NVENC capability!" % gpu_name
    1152     return cuDevice
    1153 
    11541069DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))
    11551070
    11561071def cuda_check():
    1157     cdef CUcontext context
    1158     cdef CUdevice cuDevice              #@DuplicatedSignature
    1159     cuda_devices = get_cuda_devices()
    1160     if len(cuda_devices)==0:
     1072    global DEFAULT_CUDA_DEVICE_ID
     1073    devices = get_cuda_devices()
     1074    if len(devices)==0:
    11611075        raise ImportError("no CUDA devices found!")
    1162     assert DEFAULT_CUDA_DEVICE_ID in cuda_devices.keys(), "specified CUDA device ID %s not found in %s" % (DEFAULT_CUDA_DEVICE_ID, cuda_devices)
    1163     cuDevice = get_cuda_device(DEFAULT_CUDA_DEVICE_ID)
     1076    assert DEFAULT_CUDA_DEVICE_ID in devices.keys(), "specified CUDA device ID %s not found in %s" % (DEFAULT_CUDA_DEVICE_ID, devices)
    11641077
    1165     raiseCUDA(cuCtxCreate(&context, 0, cuDevice), "creating CUDA context")
    1166     raiseCUDA(cuCtxPopCurrent(&context), "popping current context")
    1167     raiseCUDA(cuCtxDestroy(context), "destroying current context")
     1078    #create context for testing:
     1079    from pycuda import driver
     1080    d = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     1081    context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     1082    debug("cuda_check created test context, api_version=%s", context.get_api_version())
     1083    context.pop()
     1084    context.detach()
    11681085
    11691086
    11701087cdef nvencStatusInfo(NVENCSTATUS ret):
     
    11771094        raise Exception("%s - returned %s" % (msg, nvencStatusInfo(ret)))
    11781095
    11791096
     1097#BGRA2NV12_functions = {}
     1098def get_BGRA2NV12():
     1099    #global BGRA2NV12_function
     1100    #if BGRA2NV12_function:
     1101    #    return  BGRA2NV12_function
     1102    from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel
     1103    from pycuda.compiler import SourceModule
     1104    mod = SourceModule(BGRA2NV12_kernel)
     1105    BGRA2NV12_function = mod.get_function("BGRA2NV12")
     1106    return BGRA2NV12_function
     1107
     1108#import pycuda.driver.PointerHolderBase
     1109#class CustomCUDABuffer(pycuda.driver.PointerHolderBase):
     1110#    def __init__(self, pointer):
     1111#        self.pointer = pointer
     1112#    def get_pointer(self):
     1113#        return self.pointer
     1114
    11801115cdef class Encoder:
    11811116    cdef int width
    11821117    cdef int height
    11831118    cdef int encoder_width
    11841119    cdef int encoder_height
    11851120    cdef object src_format
    1186     cdef CUcontext cuda_context
     1121    #PyCUDA:
     1122    cdef object driver
     1123    cdef object cuda_device
     1124    cdef object cuda_context
     1125    cdef object BGRA2NV12
     1126    #NVENC:
    11871127    cdef NV_ENCODE_API_FUNCTION_LIST functionList               #@DuplicatedSignature
    11881128    cdef void *context
    11891129    cdef NV_ENC_REGISTERED_PTR inputHandle
    1190     cdef CUdeviceptr cudaBuffer
    1191     cdef void *inputBuffer
    1192     cdef size_t pitch
     1130    cdef object inputBuffer
     1131    cdef object cudaInputBuffer
     1132    cdef object cudaNV12Buffer
     1133    cdef int inputPitch
     1134    cdef int NV12Pitch
    11931135    cdef void *bitstreamBuffer
    11941136    cdef NV_ENC_BUFFER_FORMAT bufferFmt
    11951137    cdef object codec_name
     
    12261168        self.codec_name = "H264"
    12271169        self.preset_name = None
    12281170        self.frames = 0
    1229         self.cuda_context = NULL
     1171        self.cuda_device = None
     1172        self.cuda_context = None
    12301173        start = time.time()
    12311174
    12321175        device_id = options.get("cuda_device", DEFAULT_CUDA_DEVICE_ID)
     
    12371180
    12381181    def init_cuda(self, device_id):
    12391182        assert device_id in get_cuda_devices().keys(), "invalid device_id '%s' (available: %s)" % (device_id, cuda_devices)
    1240         cdef CUdevice cuda_device              #@DuplicatedSignature
    1241         cuda_device = get_cuda_device(DEFAULT_CUDA_DEVICE_ID)
    1242         raiseCUDA(cuCtxCreate(&self.cuda_context, 0, cuda_device))
    1243         debug("cuCtxCreate: device_id=%s, cuda_device=%s, cuda_context=%s", device_id, cuda_device, hex(<long> self.cuda_context))
    1244         #allocate CUDA input buffer (on device):
    1245         raiseCUDA(cuMemAllocPitch(&self.cudaBuffer, &self.pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device")
    1246         debug("cudaBuffer=%s, pitch=%s", hex(<long> self.cudaBuffer), self.pitch)
    1247         #allocate buffer on host:
    1248         raiseCUDA(cuMemAllocHost(&self.inputBuffer, self.pitch*self.encoder_height*3/2), "allocating CUDA input buffer on host")
    1249         debug("inputBuffer=%s", hex(<long> self.inputBuffer))
     1183        from pycuda import driver
     1184        self.driver = driver
     1185        debug("init_cuda(%s)", device_id)
     1186        self.cuda_device = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     1187        self.cuda_context = self.cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     1188        try:
     1189            debug("init_cuda(%s) cuda_device=%s, cuda_context=%s", device_id, self.cuda_device, self.cuda_context)
     1190            #compile/get kernel:
     1191            self.BGRA2NV12 = get_BGRA2NV12()
     1192            #allocate CUDA input buffer (on device):
     1193            self.cudaInputBuffer, self.inputPitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3/2, 16)
     1194            debug("CUDA Input Buffer=%s, pitch=%s", hex(int(self.cudaInputBuffer)), self.inputPitch)
     1195            #allocate CUDA NV12 buffer (on device):
     1196            self.cudaNV12Buffer, self.NV12Pitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3/2, 16)
     1197            debug("CUDA NV12 Buffer=%s, pitch=%s", hex(int(self.cudaNV12Buffer)), self.NV12Pitch)
     1198            #allocate input buffer on host:
     1199            #self.inputBuffer = driver.pagelocked_empty(self.inputPitch*self.encoder_height*3/2, dtype=numpy.byte)
     1200            self.inputBuffer = driver.pagelocked_zeros(self.inputPitch*self.encoder_height*3/2, dtype=numpy.byte)
     1201            debug("inputBuffer=%s", self.inputBuffer)
     1202   
     1203            self.init_nvenc()
     1204        finally:
     1205            self.cuda_context.pop()
    12501206
    1251         self.init_nvenc()
    1252 
    1253         raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
    1254 
    12551207    def init_nvenc(self):
    12561208        cdef GUID codec
    12571209        cdef GUID preset
     
    12591211        cdef NV_ENC_INITIALIZE_PARAMS params
    12601212        cdef NV_ENC_PRESET_CONFIG *presetConfig     #@DuplicatedSignature
    12611213        cdef NV_ENC_REGISTER_RESOURCE registerResource
     1214        cdef NV_ENC_CREATE_INPUT_BUFFER createInputBufferParams
     1215        cdef NV_ENC_CREATE_BITSTREAM_BUFFER createBitstreamBufferParams
     1216        cdef long resource
    12621217
    12631218        self.open_encode_session()
    12641219        codec = self.get_codec()
     
    12681223        input_format = BUFFER_FORMAT[self.bufferFmt]
    12691224        input_formats = self.query_input_formats(codec)
    12701225        assert input_format in input_formats, "%s does not support %s (only: %s)" %  (self.codec_name, input_format, input_formats)
    1271 
    1272         cdef NV_ENC_CREATE_INPUT_BUFFER createInputBufferParams
    1273         cdef NV_ENC_CREATE_BITSTREAM_BUFFER createBitstreamBufferParams
    12741226        try:
    12751227            presetConfig = self.get_preset_config(codec, preset)
    12761228
     
    12971249            memset(&registerResource, 0, sizeof(NV_ENC_REGISTER_RESOURCE))
    12981250            registerResource.version = NV_ENC_REGISTER_RESOURCE_VER
    12991251            registerResource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR
    1300             registerResource.resourceToRegister = <void *> self.cudaBuffer
     1252            resource = int(self.cudaNV12Buffer)
     1253            registerResource.resourceToRegister = <void *> resource
    13011254            registerResource.width = self.encoder_width
    13021255            registerResource.height = self.encoder_height
    1303             registerResource.pitch = self.pitch
     1256            registerResource.pitch = self.NV12Pitch
    13041257            raiseNVENC(self.functionList.nvEncRegisterResource(self.context, &registerResource), "registering CUDA input buffer")
    13051258            self.inputHandle = registerResource.registeredResource
    13061259            debug("input handle for CUDA buffer: %s", hex(<long> self.inputHandle))
     
    13371290
    13381291    def clean(self):                        #@DuplicatedSignature
    13391292        debug("clean() context=%s", hex(<long> self.context))
    1340         if self.cuda_context!=NULL:
    1341             raiseCUDA(cuCtxPushCurrent(self.cuda_context), "failed to push context")
     1293        if self.cuda_context:
     1294            self.cuda_context.push()
    13421295            try:
    13431296                self.cuda_clean()
    13441297            finally:
    1345                 raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
    1346                 cuCtxDestroy(self.cuda_context)
    1347                 self.cuda_context = NULL
     1298                self.cuda_context.pop()
     1299                self.cuda_context.detach()
     1300                self.cuda_context = None
    13481301
    13491302    def cuda_clean(self):
    1350         if self.inputHandle!=NULL:
     1303        if self.inputHandle!=NULL and self.context!=NULL:
    13511304            debug("clean() unregistering %s", hex(<long> self.inputHandle))
    13521305            raiseNVENC(self.functionList.nvEncUnregisterResource(self.context, self.inputHandle), "unregistering CUDA input buffer")
    13531306            self.inputHandle = NULL
    1354         if self.inputBuffer!=NULL:
    1355             debug("clean() freeing CUDA host buffer %s", hex(<long> self.inputBuffer))
    1356             raiseCUDA(cuMemFreeHost(self.inputBuffer), "freeing host buffer")
    1357             self.inputBuffer = NULL
    1358         if (<void *> self.cudaBuffer)!=NULL:
    1359             debug("clean() freeing CUDA device buffer %s", hex(<long> self.cudaBuffer))
    1360             raiseCUDA(cuMemFree(self.cudaBuffer), "freeing CUDA buffer")
    1361             self.cudaBuffer = <CUdeviceptr> NULL
    1362         if self.bitstreamBuffer!=NULL:
    1363             debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
    1364             raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
    1365             self.bitstreamBuffer = NULL
    1366         raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
     1307        if self.inputBuffer:
     1308            debug("clean() freeing CUDA host buffer %s", self.inputBuffer)
     1309            self.inputBuffer = None
     1310        if self.cudaInputBuffer:
     1311            debug("clean() freeing CUDA input buffer %s", hex(int(self.cudaInputBuffer)))
     1312            self.cudaInputBuffer.free()
     1313            self.cudaInputBuffer = None
     1314        if self.cudaNV12Buffer:
     1315            debug("clean() freeing CUDA NV12 buffer %s", hex(int(self.cudaNV12Buffer)))
     1316            self.cudaNV12Buffer.free()
     1317            self.cudaNV12Buffer = None
     1318        if self.context!=NULL:
     1319            if self.bitstreamBuffer!=NULL:
     1320                debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
     1321                raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
     1322                self.bitstreamBuffer = NULL
     1323            debug("clean() destroying encoder %s", hex(<long> self.context))
     1324            raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
    13671325
    13681326    def get_width(self):
    13691327        return self.width
     
    13971355        self.functionList.nvEncEncodePicture(self.context, &picParams)
    13981356
    13991357    def compress_image(self, image, options={}):
    1400         raiseCUDA(cuCtxPushCurrent(self.cuda_context), "failed to push context")
     1358        self.cuda_context.push()
    14011359        try:
    14021360            return self.do_compress_image(image, options)
    14031361        finally:
    1404             raiseCUDA(cuCtxPopCurrent(&self.cuda_context), "failed to pop context")
     1362            self.cuda_context.pop()
    14051363
    14061364    def do_compress_image(self, image, options={}):
    14071365        cdef const void* Y = NULL
     
    14311389        debug("compress_image(..) pixels=%s", type(pixels))
    14321390
    14331391        #copy to input buffer:
    1434         size = self.pitch * self.encoder_height * 3/2
    1435         memset(self.inputBuffer, 0, size)
     1392        size = self.inputPitch * self.encoder_height * 3/2
    14361393        #copy luma:
    1437         assert PyObject_AsReadBuffer(pixels[0], &Y, &Y_len)==0
    1438         assert PyObject_AsReadBuffer(pixels[1], &Cb, &Cb_len)==0
    1439         assert PyObject_AsReadBuffer(pixels[2], &Cr, &Cr_len)==0
    14401394        stride = strides[0]
     1395        Yplane = numpy.frombuffer(pixels[0], dtype=numpy.byte)
     1396        buffer = self.inputBuffer
    14411397        for y in range(h):
    1442             memcpy(self.inputBuffer + y*self.pitch, Y + stride*y, w)
     1398            dst = y * self.inputPitch
     1399            src = y * stride
     1400            buffer[dst:(dst+w)] = Yplane[src:(src+w)]
    14431401        #copy chroma packed:
    14441402        assert strides[1]==strides[2], "U and V strides differ: %s vs %s" % (strides[1], strides[2])
    14451403        stride = strides[1]
     1404        Uplane = numpy.frombuffer(pixels[1], dtype=numpy.byte)
     1405        Vplane = numpy.frombuffer(pixels[2], dtype=numpy.byte)
    14461406        for y in range(h/2):
    1447             offset = (self.encoder_height + y) * self.pitch
     1407            offset = (self.encoder_height + y) * self.inputPitch
    14481408            for x in range(w/2):
    1449                 (<char*> self.inputBuffer)[offset + (x*2)] = (<char *> Cb)[stride*y + x]
    1450                 (<char*> self.inputBuffer)[offset + (x*2)+1] = (<char *> Cr)[stride*y + x]
     1409                buffer[offset + (x*2)] = Uplane[stride*y + x]
     1410                buffer[offset + (x*2) + 1] = Vplane[stride*y + x]
    14511411
    14521412        #copy input buffer to CUDA buffer:
    1453         raiseCUDA(cuMemcpyHtoD(self.cudaBuffer, self.inputBuffer, size), "copy from host to device")
     1413        self.driver.memcpy_htod(self.cudaInputBuffer, self.inputBuffer)
     1414        #FIXME: just for testing:
     1415        #raiseCUDA(cuMemcpyHtoD(self.cudaNV12Buffer, self.inputBuffer, size), "TEMPORARY")
     1416        #FIXME: clear pitch padding pixels?
     1417        self.BGRA2NV12(self.cudaInputBuffer, numpy.uintp(self.inputPitch), self.cudaNV12Buffer, numpy.uintp(self.NV12Pitch), numpy.uint32(self.width), numpy.uint32(self.height), block=(16, 16, 1), grid=((self.width+15)//16, (self.height+15)//16))
    14541418        debug("compress_image(..) input buffer copied to device")
    14551419
    14561420        #map buffer so nvenc can access it:
    14571421        memset(&mapInputResource, 0, sizeof(NV_ENC_MAP_INPUT_RESOURCE))
    14581422        mapInputResource.version = NV_ENC_MAP_INPUT_RESOURCE_VER
    14591423        mapInputResource.registeredResource  = self.inputHandle
    1460         raiseCUDA(self.functionList.nvEncMapInputResource(self.context, &mapInputResource), "mapping input resource")
     1424        raiseNVENC(self.functionList.nvEncMapInputResource(self.context, &mapInputResource), "mapping input resource")
    14611425        debug("compress_image(..) device buffer mapped to %s", hex(<long> mapInputResource.mappedResource))
    14621426
    14631427        try:
     
    14671431            picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME
    14681432            picParams.inputWidth = self.encoder_width
    14691433            picParams.inputHeight = self.encoder_height
    1470             picParams.inputPitch = self.pitch
     1434            picParams.inputPitch = self.NV12Pitch
    14711435            picParams.inputBuffer = mapInputResource.mappedResource
    14721436            picParams.outputBitstream = self.bitstreamBuffer
    14731437            #picParams.pictureType: required when enablePTD is disabled
     
    15031467            pixels = (<char *> lockOutputBuffer.bitstreamBufferPtr)[:size]
    15041468        finally:
    15051469            raiseNVENC(self.functionList.nvEncUnlockBitstream(self.context, self.bitstreamBuffer), "unlocking output buffer")
    1506             raiseCUDA(self.functionList.nvEncUnmapInputResource(self.context, mapInputResource.mappedResource), "unmapping input resource")
     1470            raiseNVENC(self.functionList.nvEncUnmapInputResource(self.context, mapInputResource.mappedResource), "unmapping input resource")
    15071471
    15081472        end = time.time()
    15091473        self.frames += 1
     
    16621626
    16631627    cdef open_encode_session(self):
    16641628        cdef NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params
    1665         debug("open_encode_session(%s)", hex(<long> self.cuda_context))
     1629        debug("open_encode_session() cuda_context=%s", self.cuda_context)
     1630        debug("open_encode_session() cuda_context=%s", dir(self.cuda_context))
    16661631
    16671632        #get NVENC function pointers:
    16681633        memset(&self.functionList, 0, sizeof(NV_ENCODE_API_FUNCTION_LIST))
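
For completeness, a hedged usage sketch of the BGRA2NV12 kernel from CUDA_rgb2nv12.py above, showing how PyCUDA could compile and launch it standalone. The 640x480 dimensions, 16x16 block size and pitched buffer sizes are made up for the example; since the kernel declares the pitches as size_t and width/height as uint32_t, the scalars are passed as matching numpy types:

#usage sketch (assumed, not part of the patch):
import numpy
import pycuda.autoinit                  #creates a context on the default device
from pycuda import driver
from pycuda.compiler import SourceModule
from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel

BGRA2NV12 = SourceModule(BGRA2NV12_kernel).get_function("BGRA2NV12")

w, h = 640, 480
#pitched device buffers: source image and NV12 output (h*3/2 rows):
src, src_pitch = driver.mem_alloc_pitch(w, h, 16)
dst, dst_pitch = driver.mem_alloc_pitch(w, h*3//2, 16)
#one thread per pixel, 16x16 threads per block:
BGRA2NV12(src, numpy.uintp(src_pitch),
          dst, numpy.uintp(dst_pitch),
          numpy.uint32(w), numpy.uint32(h),
          block=(16, 16, 1),
          grid=((w+15)//16, (h+15)//16))
driver.Context.synchronize()

As the kernel comment says, it only copies one byte per pixel for now, so this launch exercises the plumbing rather than a real BGRA to NV12 conversion.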