xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #558: nvenc-hacked-win32.patch

File nvenc-hacked-win32.patch, 7.5 KB (added by Antoine Martin, 7 years ago)

more hacks to tweak input values to the kernel and encoder

  • xpra/codecs/cuda_common/BGRA_to_NV12.cu

     
    77
    88#include <stdint.h>
    99
    10 extern "C" __global__ void BGRA_to_NV12(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
    11                           uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
     10extern "C" __global__ void BGRA_to_NV12(uint8_t *srcImage, int src_w, int src_h, int srcHPitch,
     11                          uint8_t *dstImage, int dst_w, int dst_h,
     12                          int dstHPitch, int dstVPitch,
    1213                          int w, int h)
    1314{
    1415    uint32_t gx, gy;
     
    2122    if ((src_x < w) & (src_y < h)) {
    2223        //4 bytes per pixel, and 2 pixels width/height at a time:
    2324        //byte index:
    24         uint32_t si = (src_y * srcPitch) + src_x * 4;
     25        uint32_t si = (src_y * srcHPitch) + src_x * 4;
    2526
    2627        //we may read up to 4 32-bit RGB pixels:
    2728        uint8_t R[4];
     
    3839        }
    3940
    4041        //write up to 4 Y pixels:
    41         uint32_t di = (gy * 2 * dstPitch) + gx * 2;
     42        uint32_t di = (gy * 2 * dstHPitch) + gx * 2;
    4243        dstImage[di] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16);
    4344        if (gx*2 + 1 < src_w) {
    4445            R[1] = srcImage[si+6];
     
    4748            dstImage[di + 1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16);
    4849        }
    4950        if (gy*2 + 1 < src_h) {
    50             si += srcPitch;
    51             di += dstPitch;
     51            si += srcHPitch;
     52            di += dstHPitch;
    5253            R[2] = srcImage[si+2];
    5354            G[2] = srcImage[si+1];
    5455            B[2] = srcImage[si];
     
    6869            u += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128;
    6970            v +=  0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128;
    7071        }
    71         di = (dst_h + gy) * dstPitch + gx * 2;
     72        di = (dstVPitch + gy) * dstHPitch + gx * 2;
    7273        dstImage[di]      = __float2int_rn(u / 4.0);
    7374        dstImage[di + 1]  = __float2int_rn(v / 4.0);
    7475    }
  • xpra/codecs/cuda_common/BGRA_to_YUV444.cu

     
    77
    88#include <stdint.h>
    99
    10 extern "C" __global__ void BGRA_to_YUV444(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
    11                              uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
    12                              int w, int h)
     10extern "C" __global__ void BGRA_to_YUV444(uint8_t *srcImage, int src_w, int src_h, int srcHPitch,
     11                          uint8_t *dstImage, int dst_w, int dst_h,
     12                          int dstHPitch, int dstVPitch,
     13                          int w, int h)
    1314{
    1415    uint32_t gx, gy;
    1516    gx = blockIdx.x * blockDim.x + threadIdx.x;
     
    2324        uint8_t G;
    2425        uint8_t B;
    2526        //one 32-bit RGB pixel at a time:
    26         uint32_t si = (src_y * srcPitch) + src_x * 4;
     27        uint32_t si = (src_y * srcHPitch) + src_x * 4;
    2728        R = srcImage[si+2];
    2829        G = srcImage[si+1];
    2930        B = srcImage[si];
    3031
    3132        uint32_t di;
    32         di = (gy * dstPitch) + gx;
     33        di = (gy * dstHPitch) + gx;
    3334        dstImage[di] = __float2int_rn(0.257 * R + 0.504 * G + 0.098 * B + 16);
    34         di += dstPitch*dst_h;
     35        di += dstHPitch*dstVPitch;
    3536        dstImage[di] = __float2int_rn(-0.148 * R - 0.291 * G + 0.439 * B + 128);
    36         di += dstPitch*dst_h;
     37        di += dstHPitch*dstVPitch;
    3738        dstImage[di] = __float2int_rn(0.439 * R - 0.368 * G - 0.071 * B + 128);
    3839    }
    3940}
  • xpra/codecs/nvenc4/encoder.pyx

     
    4040cdef int YUV444_THRESHOLD = int(os.environ.get("XPRA_NVENC_YUV444_THRESHOLD", "85"))
    4141cdef int LOSSLESS_THRESHOLD = int(os.environ.get("XPRA_NVENC_LOSSLESS_THRESHOLD", "100"))
    4242cdef int DEBUG_API = int(os.environ.get("XPRA_NVENC_DEBUG_API", "0")=="1")
     43cdef int VADJUST = int(os.environ.get("XPRA_NVENC_VADJUST", "0"))
     44cdef int VPAD = int(os.environ.get("XPRA_NVENC_VPAD", "0"))
     45cdef int VROUND = int(os.environ.get("XPRA_NVENC_VROUND", "0"))
    4346
    4447cdef int QP_MAX_VALUE = 51   #newer versions of ffmpeg can decode up to 63
    4548
     
    12951298        self.quality = quality
    12961299        self.scaling = scaling or (1, 1)
    12971300        v, u = self.scaling
    1298         self.input_width = roundup(width, 32)
    1299         self.input_height = roundup(height, 32)
    1300         self.encoder_width = roundup(width*v//u, 32)
    1301         self.encoder_height = roundup(height*v//u, 32)
     1301        self.input_width = roundup(width, 64)
     1302        self.input_height = roundup(height+VPAD, VROUND)
     1303        self.encoder_width = roundup(width*v//u, 64)
     1304        self.encoder_height = roundup((height+VPAD)*v//u, VROUND)
    13021305        self.src_format = src_format
    13031306        self.dst_formats = dst_formats
    13041307        self.codec_name = "H264"
     
    14011404            self.cudaInputBuffer, self.inputPitch = driver.mem_alloc_pitch(max_input_stride, self.input_height, 16)
    14021405            log("CUDA Input Buffer=%#x, pitch=%s", int(self.cudaInputBuffer), self.inputPitch)
    14031406            #allocate CUDA output buffer (on device):
    1404             self.cudaOutputBuffer, self.outputPitch = driver.mem_alloc_pitch(self.encoder_width, self.encoder_height*3//plane_size_div, 16)
     1407            self.cudaOutputBuffer, self.outputPitch = driver.mem_alloc_pitch(self.encoder_width, roundup(self.encoder_height+MAX(0, VADJUST), 64)*3//plane_size_div, 16)
    14051408            log("CUDA Output Buffer=%#x, pitch=%s", int(self.cudaOutputBuffer), self.outputPitch)
    14061409            #allocate input buffer on host:
    14071410            self.inputBuffer = driver.pagelocked_zeros(self.inputPitch*self.input_height, dtype=numpy.byte)
     
    19361939
    19371940        csc_start = time.time()
    19381941        args = (self.cudaInputBuffer, numpy.int32(in_w), numpy.int32(in_h), numpy.int32(stride),
    1939                self.cudaOutputBuffer, numpy.int32(self.encoder_width), numpy.int32(self.encoder_height), numpy.int32(self.outputPitch),
     1942               self.cudaOutputBuffer, numpy.int32(self.encoder_width), numpy.int32(self.encoder_height),
     1943               numpy.int32(self.outputPitch), numpy.int32(MAX(1, self.encoder_height+VADJUST)),
    19401944               numpy.int32(w), numpy.int32(h))
    1941         log("calling %s%s with block=%s, grid=%s", self.kernel, args, (blockw,blockh,1), (gridw, gridh))
     1945        log.info("calling %s%s with block=%s, grid=%s (VADJUST=%s)", self.kernel, args, (blockw,blockh,1), (gridw, gridh), VADJUST)
    19421946        self.kernel(*args, block=(blockw,blockh,1), grid=(gridw, gridh))
    19431947        csc_end = time.time()
    19441948        log("compress_image(..) kernel %s executed - CSC took %.1f ms", self.kernel_name, (csc_end - csc_start)*1000.0)
     
    20012005                r = self.functionList.nvEncEncodePicture(self.context, &picParams)
    20022006            raiseNVENC(r, "error during picture encoding")
    20032007            encode_end = time.time()
    2004             log("compress_image(..) encoded in %.1f ms, info=%s", (encode_end-csc_end)*1000.0, self.get_info())
     2008            log.info("compress_image(..) encoded in %.1f ms, info=%s", (encode_end-csc_end)*1000.0, self.get_info())
    20052009
    20062010            #lock output buffer:
    20072011            memset(&lockOutputBuffer, 0, sizeof(NV_ENC_LOCK_BITSTREAM))