xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued;
please use https://github.com/Xpra-org/xpra instead.


Ticket #370: nvenc-dualbuffers.patch

File nvenc-dualbuffers.patch, 7.6 KB (added by Antoine Martin, 8 years ago)

Use two buffers on the CUDA side so that a kernel can copy (and convert) from one to the other.

  • xpra/codecs/nvenc/encoder.pyx

     
    11871187    cdef NV_ENCODE_API_FUNCTION_LIST functionList               #@DuplicatedSignature
    11881188    cdef void *context
    11891189    cdef NV_ENC_REGISTERED_PTR inputHandle
    1190     cdef CUdeviceptr cudaBuffer
     1190    cdef CUdeviceptr cudaInputBuffer
     1191    cdef CUdeviceptr cudaNV12Buffer
     1192    cdef size_t inputPitch
     1193    cdef size_t NV12Pitch
    11911194    cdef void *inputBuffer
    1192     cdef size_t pitch
    11931195    cdef void *bitstreamBuffer
    11941196    cdef NV_ENC_BUFFER_FORMAT bufferFmt
    11951197    cdef object codec_name
     
    12421244        raiseCUDA(cuCtxCreate(&self.cuda_context, 0, cuda_device))
    12431245        debug("cuCtxCreate: device_id=%s, cuda_device=%s, cuda_context=%s", device_id, cuda_device, hex(<long> self.cuda_context))
    12441246        #allocate CUDA input buffer (on device):
    1245         raiseCUDA(cuMemAllocPitch(&self.cudaBuffer, &self.pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device")
    1246         debug("cudaBuffer=%s, pitch=%s", hex(<long> self.cudaBuffer), self.pitch)
     1247        raiseCUDA(cuMemAllocPitch(&self.cudaInputBuffer, &self.inputPitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device")
     1248        debug("CUDA Input Buffer=%s, pitch=%s", hex(<long> self.cudaInputBuffer), self.inputPitch)
     1249        #allocate CUDA NV12 buffer (on device):
     1250        raiseCUDA(cuMemAllocPitch(&self.cudaNV12Buffer, &self.NV12Pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA NV12 buffer on device")
     1251        debug("CUDA NV12 Buffer=%s, pitch=%s", hex(<long> self.cudaNV12Buffer), self.NV12Pitch)
    12471252        #allocate buffer on host:
    1248         raiseCUDA(cuMemAllocHost(&self.inputBuffer, self.pitch*self.encoder_height*3/2), "allocating CUDA input buffer on host")
     1253        raiseCUDA(cuMemAllocHost(&self.inputBuffer, self.inputPitch*self.encoder_height*3/2), "allocating CUDA input buffer on host")
    12491254        debug("inputBuffer=%s", hex(<long> self.inputBuffer))
    12501255
    12511256        self.init_nvenc()
     
    12971302            memset(&registerResource, 0, sizeof(NV_ENC_REGISTER_RESOURCE))
    12981303            registerResource.version = NV_ENC_REGISTER_RESOURCE_VER
    12991304            registerResource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR
    1300             registerResource.resourceToRegister = <void *> self.cudaBuffer
     1305            registerResource.resourceToRegister = <void *> self.cudaNV12Buffer
    13011306            registerResource.width = self.encoder_width
    13021307            registerResource.height = self.encoder_height
    1303             registerResource.pitch = self.pitch
     1308            registerResource.pitch = self.NV12Pitch
    13041309            raiseNVENC(self.functionList.nvEncRegisterResource(self.context, &registerResource), "registering CUDA input buffer")
    13051310            self.inputHandle = registerResource.registeredResource
    13061311            debug("input handle for CUDA buffer: %s", hex(<long> self.inputHandle))
     
    13471352                self.cuda_context = NULL
    13481353
    13491354    def cuda_clean(self):
    1350         if self.inputHandle!=NULL:
     1355        if self.inputHandle!=NULL and self.context!=NULL:
    13511356            debug("clean() unregistering %s", hex(<long> self.inputHandle))
    13521357            raiseNVENC(self.functionList.nvEncUnregisterResource(self.context, self.inputHandle), "unregistering CUDA input buffer")
    13531358            self.inputHandle = NULL
     
    13551360            debug("clean() freeing CUDA host buffer %s", hex(<long> self.inputBuffer))
    13561361            raiseCUDA(cuMemFreeHost(self.inputBuffer), "freeing host buffer")
    13571362            self.inputBuffer = NULL
    1358         if (<void *> self.cudaBuffer)!=NULL:
    1359             debug("clean() freeing CUDA device buffer %s", hex(<long> self.cudaBuffer))
    1360             raiseCUDA(cuMemFree(self.cudaBuffer), "freeing CUDA buffer")
    1361             self.cudaBuffer = <CUdeviceptr> NULL
    1362         if self.bitstreamBuffer!=NULL:
    1363             debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
    1364             raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
    1365             self.bitstreamBuffer = NULL
    1366         raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
     1363        if (<void *> self.cudaInputBuffer)!=NULL:
     1364            debug("clean() freeing CUDA input buffer %s", hex(<long> self.cudaInputBuffer))
     1365            raiseCUDA(cuMemFree(self.cudaInputBuffer), "freeing CUDA input buffer")
     1366            self.cudaInputBuffer = <CUdeviceptr> NULL
     1367        if (<void *> self.cudaNV12Buffer)!=NULL:
     1368            debug("clean() freeing CUDA NV12 buffer %s", hex(<long> self.cudaNV12Buffer))
     1369            raiseCUDA(cuMemFree(self.cudaNV12Buffer), "freeing CUDA NV12 buffer")
     1370            self.cudaNV12Buffer = <CUdeviceptr> NULL
     1371        if self.context!=NULL:
     1372            if self.bitstreamBuffer!=NULL:
     1373                debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer))
     1374                raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer")
     1375                self.bitstreamBuffer = NULL
     1376            debug("clean() destroying encoder %s", hex(<long> self.context))
     1377            raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context")
    13671378
    13681379    def get_width(self):
    13691380        return self.width
     
    14311442        debug("compress_image(..) pixels=%s", type(pixels))
    14321443
    14331444        #copy to input buffer:
    1434         size = self.pitch * self.encoder_height * 3/2
     1445        size = self.inputPitch * self.encoder_height * 3/2
    14351446        memset(self.inputBuffer, 0, size)
    14361447        #copy luma:
    14371448        assert PyObject_AsReadBuffer(pixels[0], &Y, &Y_len)==0
     
    14391450        assert PyObject_AsReadBuffer(pixels[2], &Cr, &Cr_len)==0
    14401451        stride = strides[0]
    14411452        for y in range(h):
    1442             memcpy(self.inputBuffer + y*self.pitch, Y + stride*y, w)
     1453            memcpy(self.inputBuffer + y*self.inputPitch, Y + stride*y, w)
    14431454        #copy chroma packed:
    14441455        assert strides[1]==strides[2], "U and V strides differ: %s vs %s" % (strides[1], strides[2])
    14451456        stride = strides[1]
    14461457        for y in range(h/2):
    1447             offset = (self.encoder_height + y) * self.pitch
     1458            offset = (self.encoder_height + y) * self.inputPitch
    14481459            for x in range(w/2):
    14491460                (<char*> self.inputBuffer)[offset + (x*2)] = (<char *> Cb)[stride*y + x]
    14501461                (<char*> self.inputBuffer)[offset + (x*2)+1] = (<char *> Cr)[stride*y + x]
    14511462
    14521463        #copy input buffer to CUDA buffer:
    1453         raiseCUDA(cuMemcpyHtoD(self.cudaBuffer, self.inputBuffer, size), "copy from host to device")
     1464        raiseCUDA(cuMemcpyHtoD(self.cudaInputBuffer, self.inputBuffer, size), "copy from host to device input buffer")
     1465        #FIXME: just for testing:
     1466        raiseCUDA(cuMemcpyHtoD(self.cudaNV12Buffer, self.inputBuffer, size), "TEMPORARY")
    14541467        debug("compress_image(..) input buffer copied to device")
    14551468
    14561469        #map buffer so nvenc can access it:
     
    14671480            picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME
    14681481            picParams.inputWidth = self.encoder_width
    14691482            picParams.inputHeight = self.encoder_height
    1470             picParams.inputPitch = self.pitch
     1483            picParams.inputPitch = self.NV12Pitch
    14711484            picParams.inputBuffer = mapInputResource.mappedResource
    14721485            picParams.outputBitstream = self.bitstreamBuffer
    14731486            #picParams.pictureType: required when enablePTD is disabled