Ticket #558: nvenc-hacked-win32.patch
File nvenc-hacked-win32.patch, 7.5 KB (added 7 years ago)
---|
-
xpra/codecs/cuda_common/BGRA_to_NV12.cu
@@ -7,8 +7,9 @@
 
 #include <stdint.h>
 
-extern "C" __global__ void BGRA_to_NV12(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
-                             uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
+extern "C" __global__ void BGRA_to_NV12(uint8_t *srcImage, int src_w, int src_h, int srcHPitch,
+                             uint8_t *dstImage, int dst_w, int dst_h,
+                             int dstHPitch, int dstVPitch,
                              int w, int h)
 {
     uint32_t gx, gy;
@@ -21,7 +22,7 @@
     if ((src_x < w) & (src_y < h)) {
         //4 bytes per pixel, and 2 pixels width/height at a time:
         //byte index:
-        uint32_t si = (src_y * srcPitch) + src_x * 4;
+        uint32_t si = (src_y * srcHPitch) + src_x * 4;
 
         //we may read up to 4 32-bit RGB pixels:
         uint8_t R[4];
@@ -38,7 +39,7 @@
         }
 
         //write up to 4 Y pixels:
-        uint32_t di = (gy * 2 * dstPitch) + gx * 2;
+        uint32_t di = (gy * 2 * dstHPitch) + gx * 2;
         dstImage[di] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16);
         if (gx*2 + 1 < src_w) {
             R[1] = srcImage[si+6];
@@ -47,8 +48,8 @@
             dstImage[di + 1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16);
         }
         if (gy*2 + 1 < src_h) {
-            si += srcPitch;
-            di += dstPitch;
+            si += srcHPitch;
+            di += dstHPitch;
             R[2] = srcImage[si+2];
             G[2] = srcImage[si+1];
             B[2] = srcImage[si];
@@ -68,7 +69,7 @@
             u += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128;
             v += 0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128;
         }
-        di = (dst_h + gy) * dstPitch + gx * 2;
+        di = (dstVPitch + gy) * dstHPitch + gx * 2;
         dstImage[di] = __float2int_rn(u / 4.0);
         dstImage[di + 1] = __float2int_rn(v / 4.0);
     }
xpra/codecs/cuda_common/BGRA_to_YUV444.cu
@@ -7,9 +7,10 @@
 
 #include <stdint.h>
 
-extern "C" __global__ void BGRA_to_YUV444(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
-                             uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
-                             int w, int h)
+extern "C" __global__ void BGRA_to_YUV444(uint8_t *srcImage, int src_w, int src_h, int srcHPitch,
+                             uint8_t *dstImage, int dst_w, int dst_h,
+                             int dstHPitch, int dstVPitch,
+                             int w, int h)
 {
     uint32_t gx, gy;
     gx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -23,17 +24,17 @@
         uint8_t G;
         uint8_t B;
         //one 32-bit RGB pixel at a time:
-        uint32_t si = (src_y * srcPitch) + src_x * 4;
+        uint32_t si = (src_y * srcHPitch) + src_x * 4;
         R = srcImage[si+2];
         G = srcImage[si+1];
         B = srcImage[si];
 
         uint32_t di;
-        di = (gy * dstPitch) + gx;
+        di = (gy * dstHPitch) + gx;
         dstImage[di] = __float2int_rn(0.257 * R + 0.504 * G + 0.098 * B + 16);
-        di += dstPitch*dst_h;
+        di += dstHPitch*dstVPitch;
         dstImage[di] = __float2int_rn(-0.148 * R - 0.291 * G + 0.439 * B + 128);
-        di += dstPitch*dst_h;
+        di += dstHPitch*dstVPitch;
         dstImage[di] = __float2int_rn(0.439 * R - 0.368 * G - 0.071 * B + 128);
     }
 }
xpra/codecs/nvenc4/encoder.pyx
@@ -40,6 +40,9 @@
 cdef int YUV444_THRESHOLD = int(os.environ.get("XPRA_NVENC_YUV444_THRESHOLD", "85"))
 cdef int LOSSLESS_THRESHOLD = int(os.environ.get("XPRA_NVENC_LOSSLESS_THRESHOLD", "100"))
 cdef int DEBUG_API = int(os.environ.get("XPRA_NVENC_DEBUG_API", "0")=="1")
+cdef int VADJUST = int(os.environ.get("XPRA_NVENC_VADJUST", "0"))
+cdef int VPAD = int(os.environ.get("XPRA_NVENC_VPAD", "0"))
+cdef int VROUND = int(os.environ.get("XPRA_NVENC_VROUND", "0"))
 
 cdef int QP_MAX_VALUE = 51   #newer versions of ffmpeg can decode up to 63
 
@@ -1295,10 +1298,10 @@
         self.quality = quality
         self.scaling = scaling or (1, 1)
         v, u = self.scaling
-        self.input_width = roundup(width, 32)
-        self.input_height = roundup(height, 32)
-        self.encoder_width = roundup(width*v//u, 32)
-        self.encoder_height = roundup(height*v//u, 32)
+        self.input_width = roundup(width, 64)
+        self.input_height = roundup(height+VPAD, VROUND)
+        self.encoder_width = roundup(width*v//u, 64)
+        self.encoder_height = roundup((height+VPAD)*v//u, VROUND)
         self.src_format = src_format
         self.dst_formats = dst_formats
         self.codec_name = "H264"
@@ -1401,7 +1404,7 @@
         self.cudaInputBuffer, self.inputPitch = driver.mem_alloc_pitch(max_input_stride, self.input_height, 16)
         log("CUDA Input Buffer=%#x, pitch=%s", int(self.cudaInputBuffer), self.inputPitch)
         #allocate CUDA output buffer (on device):
-        self.cudaOutputBuffer, self.outputPitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3//plane_size_div, 16)
+        self.cudaOutputBuffer, self.outputPitch = driver.mem_alloc_pitch(self.encoder_width, roundup(self.encoder_height+MAX(0, VADJUST), 64)*3//plane_size_div, 16)
         log("CUDA Output Buffer=%#x, pitch=%s", int(self.cudaOutputBuffer), self.outputPitch)
         #allocate input buffer on host:
         self.inputBuffer = driver.pagelocked_zeros(self.inputPitch*self.input_height, dtype=numpy.byte)
@@ -1936,9 +1939,10 @@
 
         csc_start = time.time()
         args = (self.cudaInputBuffer, numpy.int32(in_w), numpy.int32(in_h), numpy.int32(stride),
-                self.cudaOutputBuffer, numpy.int32(self.encoder_width), numpy.int32(self.encoder_height), numpy.int32(self.outputPitch),
+                self.cudaOutputBuffer, numpy.int32(self.encoder_width), numpy.int32(self.encoder_height),
+                numpy.int32(self.outputPitch), numpy.int32(MAX(1, self.encoder_height+VADJUST)),
                 numpy.int32(w), numpy.int32(h))
-        log("calling %s%s with block=%s, grid=%s", self.kernel, args, (blockw,blockh,1), (gridw, gridh))
+        log.info("calling %s%s with block=%s, grid=%s (VADJUST=%s)", self.kernel, args, (blockw,blockh,1), (gridw, gridh), VADJUST)
         self.kernel(*args, block=(blockw,blockh,1), grid=(gridw, gridh))
         csc_end = time.time()
         log("compress_image(..) kernel %s executed - CSC took %.1f ms", self.kernel_name, (csc_end - csc_start)*1000.0)
@@ -2001,7 +2005,7 @@
         r = self.functionList.nvEncEncodePicture(self.context, &picParams)
         raiseNVENC(r, "error during picture encoding")
         encode_end = time.time()
-        log("compress_image(..) encoded in %.1f ms, info=%s", (encode_end-csc_end)*1000.0, self.get_info())
+        log.info("compress_image(..) encoded in %.1f ms, info=%s", (encode_end-csc_end)*1000.0, self.get_info())
 
         #lock output buffer:
         memset(&lockOutputBuffer, 0, sizeof(NV_ENC_LOCK_BITSTREAM))