Ticket #370: nvenc-dualbuffers.patch
File nvenc-dualbuffers.patch, 7.6 KB (added by , 8 years ago) |
---|
-
xpra/codecs/nvenc/encoder.pyx
1187 1187 cdef NV_ENCODE_API_FUNCTION_LIST functionList #@DuplicatedSignature 1188 1188 cdef void *context 1189 1189 cdef NV_ENC_REGISTERED_PTR inputHandle 1190 cdef CUdeviceptr cudaBuffer 1190 cdef CUdeviceptr cudaInputBuffer 1191 cdef CUdeviceptr cudaNV12Buffer 1192 cdef size_t inputPitch 1193 cdef size_t NV12Pitch 1191 1194 cdef void *inputBuffer 1192 cdef size_t pitch1193 1195 cdef void *bitstreamBuffer 1194 1196 cdef NV_ENC_BUFFER_FORMAT bufferFmt 1195 1197 cdef object codec_name … … 1242 1244 raiseCUDA(cuCtxCreate(&self.cuda_context, 0, cuda_device)) 1243 1245 debug("cuCtxCreate: device_id=%s, cuda_device=%s, cuda_context=%s", device_id, cuda_device, hex(<long> self.cuda_context)) 1244 1246 #allocate CUDA input buffer (on device): 1245 raiseCUDA(cuMemAllocPitch(&self.cudaBuffer, &self.pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device") 1246 debug("cudaBuffer=%s, pitch=%s", hex(<long> self.cudaBuffer), self.pitch) 1247 raiseCUDA(cuMemAllocPitch(&self.cudaInputBuffer, &self.inputPitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA input buffer on device") 1248 debug("CUDA Input Buffer=%s, pitch=%s", hex(<long> self.cudaInputBuffer), self.inputPitch) 1249 #allocate CUDA NV12 buffer (on device): 1250 raiseCUDA(cuMemAllocPitch(&self.cudaNV12Buffer, &self.NV12Pitch, self.encoder_width, self.encoder_height*3/2, 16), "allocating CUDA NV12 buffer on device") 1251 debug("CUDA NV12 Buffer=%s, pitch=%s", hex(<long> self.cudaNV12Buffer), self.NV12Pitch) 1247 1252 #allocate buffer on host: 1248 raiseCUDA(cuMemAllocHost(&self.inputBuffer, self. 
pitch*self.encoder_height*3/2), "allocating CUDA input buffer on host")1253 raiseCUDA(cuMemAllocHost(&self.inputBuffer, self.inputPitch*self.encoder_height*3/2), "allocating CUDA input buffer on host") 1249 1254 debug("inputBuffer=%s", hex(<long> self.inputBuffer)) 1250 1255 1251 1256 self.init_nvenc() … … 1297 1302 memset(&registerResource, 0, sizeof(NV_ENC_REGISTER_RESOURCE)) 1298 1303 registerResource.version = NV_ENC_REGISTER_RESOURCE_VER 1299 1304 registerResource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR 1300 registerResource.resourceToRegister = <void *> self.cuda Buffer1305 registerResource.resourceToRegister = <void *> self.cudaNV12Buffer 1301 1306 registerResource.width = self.encoder_width 1302 1307 registerResource.height = self.encoder_height 1303 registerResource.pitch = self. pitch1308 registerResource.pitch = self.NV12Pitch 1304 1309 raiseNVENC(self.functionList.nvEncRegisterResource(self.context, &registerResource), "registering CUDA input buffer") 1305 1310 self.inputHandle = registerResource.registeredResource 1306 1311 debug("input handle for CUDA buffer: %s", hex(<long> self.inputHandle)) … … 1347 1352 self.cuda_context = NULL 1348 1353 1349 1354 def cuda_clean(self): 1350 if self.inputHandle!=NULL :1355 if self.inputHandle!=NULL and self.context!=NULL: 1351 1356 debug("clean() unregistering %s", hex(<long> self.inputHandle)) 1352 1357 raiseNVENC(self.functionList.nvEncUnregisterResource(self.context, self.inputHandle), "unregistering CUDA input buffer") 1353 1358 self.inputHandle = NULL … … 1355 1360 debug("clean() freeing CUDA host buffer %s", hex(<long> self.inputBuffer)) 1356 1361 raiseCUDA(cuMemFreeHost(self.inputBuffer), "freeing host buffer") 1357 1362 self.inputBuffer = NULL 1358 if (<void *> self.cudaBuffer)!=NULL: 1359 debug("clean() freeing CUDA device buffer %s", hex(<long> self.cudaBuffer)) 1360 raiseCUDA(cuMemFree(self.cudaBuffer), "freeing CUDA buffer") 1361 self.cudaBuffer = <CUdeviceptr> NULL 1362 if 
self.bitstreamBuffer!=NULL: 1363 debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer)) 1364 raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer") 1365 self.bitstreamBuffer = NULL 1366 raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context") 1363 if (<void *> self.cudaInputBuffer)!=NULL: 1364 debug("clean() freeing CUDA input buffer %s", hex(<long> self.cudaInputBuffer)) 1365 raiseCUDA(cuMemFree(self.cudaInputBuffer), "freeing CUDA input buffer") 1366 self.cudaInputBuffer = <CUdeviceptr> NULL 1367 if (<void *> self.cudaNV12Buffer)!=NULL: 1368 debug("clean() freeing CUDA NV12 buffer %s", hex(<long> self.cudaNV12Buffer)) 1369 raiseCUDA(cuMemFree(self.cudaNV12Buffer), "freeing CUDA NV12 buffer") 1370 self.cudaNV12Buffer = <CUdeviceptr> NULL 1371 if self.context!=NULL: 1372 if self.bitstreamBuffer!=NULL: 1373 debug("clean() destroying bitstream buffer %s", hex(<long> self.bitstreamBuffer)) 1374 raiseNVENC(self.functionList.nvEncDestroyBitstreamBuffer(self.context, self.bitstreamBuffer), "destroying output buffer") 1375 self.bitstreamBuffer = NULL 1376 debug("clean() destroying encoder %s", hex(<long> self.context)) 1377 raiseNVENC(self.functionList.nvEncDestroyEncoder(self.context), "destroying context") 1367 1378 1368 1379 def get_width(self): 1369 1380 return self.width … … 1431 1442 debug("compress_image(..) pixels=%s", type(pixels)) 1432 1443 1433 1444 #copy to input buffer: 1434 size = self. pitch * self.encoder_height * 3/21445 size = self.inputPitch * self.encoder_height * 3/2 1435 1446 memset(self.inputBuffer, 0, size) 1436 1447 #copy luma: 1437 1448 assert PyObject_AsReadBuffer(pixels[0], &Y, &Y_len)==0 … … 1439 1450 assert PyObject_AsReadBuffer(pixels[2], &Cr, &Cr_len)==0 1440 1451 stride = strides[0] 1441 1452 for y in range(h): 1442 memcpy(self.inputBuffer + y*self. 
pitch, Y + stride*y, w)1453 memcpy(self.inputBuffer + y*self.inputPitch, Y + stride*y, w) 1443 1454 #copy chroma packed: 1444 1455 assert strides[1]==strides[2], "U and V strides differ: %s vs %s" % (strides[1], strides[2]) 1445 1456 stride = strides[1] 1446 1457 for y in range(h/2): 1447 offset = (self.encoder_height + y) * self. pitch1458 offset = (self.encoder_height + y) * self.inputPitch 1448 1459 for x in range(w/2): 1449 1460 (<char*> self.inputBuffer)[offset + (x*2)] = (<char *> Cb)[stride*y + x] 1450 1461 (<char*> self.inputBuffer)[offset + (x*2)+1] = (<char *> Cr)[stride*y + x] 1451 1462 1452 1463 #copy input buffer to CUDA buffer: 1453 raiseCUDA(cuMemcpyHtoD(self.cudaBuffer, self.inputBuffer, size), "copy from host to device") 1464 raiseCUDA(cuMemcpyHtoD(self.cudaInputBuffer, self.inputBuffer, size), "copy from host to device input buffer") 1465 #FIXME: just for testing: 1466 raiseCUDA(cuMemcpyHtoD(self.cudaNV12Buffer, self.inputBuffer, size), "TEMPORARY") 1454 1467 debug("compress_image(..) input buffer copied to device") 1455 1468 1456 1469 #map buffer so nvenc can access it: … … 1467 1480 picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME 1468 1481 picParams.inputWidth = self.encoder_width 1469 1482 picParams.inputHeight = self.encoder_height 1470 picParams.inputPitch = self. pitch1483 picParams.inputPitch = self.NV12Pitch 1471 1484 picParams.inputBuffer = mapInputResource.mappedResource 1472 1485 picParams.outputBitstream = self.bitstreamBuffer 1473 1486 #picParams.pictureType: required when enablePTD is disabled