gpu-only encoding works now yay
This commit is contained in:
parent
09142f9668
commit
2ecf9af167
5 changed files with 112 additions and 172 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -3,3 +3,4 @@ client/dist
|
||||||
client/node_modules
|
client/node_modules
|
||||||
|
|
||||||
/server/target
|
/server/target
|
||||||
|
/server/testdata
|
|
@ -22,33 +22,12 @@ use std::{
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
use rand::distributions::DistString;
|
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use tokio::sync::{
|
use tokio::sync::{
|
||||||
broadcast,
|
|
||||||
mpsc::{self, error::TryRecvError},
|
mpsc::{self, error::TryRecvError},
|
||||||
Mutex as TokioMutex,
|
Mutex as TokioMutex,
|
||||||
};
|
};
|
||||||
|
|
||||||
use axum::{
|
|
||||||
extract::{
|
|
||||||
connect_info::ConnectInfo,
|
|
||||||
ws::{self, Message, WebSocket, WebSocketUpgrade},
|
|
||||||
State,
|
|
||||||
},
|
|
||||||
response::IntoResponse,
|
|
||||||
routing::get,
|
|
||||||
Router,
|
|
||||||
};
|
|
||||||
|
|
||||||
use futures::{sink::SinkExt, stream::StreamExt};
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
enum WsMessage {
|
|
||||||
VideoPacket { packet: ffmpeg::Packet },
|
|
||||||
Json(String),
|
|
||||||
}
|
|
||||||
|
|
||||||
struct AppState<T: Transport> {
|
struct AppState<T: Transport> {
|
||||||
encoder_tx: Arc<TokioMutex<mpsc::Sender<EncodeThreadInput>>>,
|
encoder_tx: Arc<TokioMutex<mpsc::Sender<EncodeThreadInput>>>,
|
||||||
inputs: Arc<TokioMutex<Vec<u32>>>,
|
inputs: Arc<TokioMutex<Vec<u32>>>,
|
||||||
|
@ -184,23 +163,6 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLIP_SRC: &str = "
|
|
||||||
extern \"C\" __global__ void flip_opengl(
|
|
||||||
const unsigned* pSrc,
|
|
||||||
unsigned* pDest,
|
|
||||||
unsigned width,
|
|
||||||
unsigned height
|
|
||||||
) {
|
|
||||||
const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
|
|
||||||
if (x < width && y < height) {
|
|
||||||
// let reversed_y = (size.height - 1) - y;
|
|
||||||
unsigned reversed_y = (height - 1) - y;
|
|
||||||
pDest[y * width + x] = pSrc[reversed_y * width + x];
|
|
||||||
}
|
|
||||||
}";
|
|
||||||
|
|
||||||
#[tokio::main(flavor = "multi_thread", worker_threads = 2)]
|
#[tokio::main(flavor = "multi_thread", worker_threads = 2)]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
// Setup a tracing subscriber
|
// Setup a tracing subscriber
|
||||||
|
@ -213,13 +175,6 @@ async fn main() -> anyhow::Result<()> {
|
||||||
// H.264 encoder related
|
// H.264 encoder related
|
||||||
let device = CudaDevice::new(0)?;
|
let device = CudaDevice::new(0)?;
|
||||||
|
|
||||||
let ptx = cudarc::nvrtc::compile_ptx(&FLIP_SRC).with_context(|| "compiling support kernel")?;
|
|
||||||
|
|
||||||
println!("compiled kernel");
|
|
||||||
|
|
||||||
// pop it in
|
|
||||||
device.load_ptx(ptx, "module", &["flip_opengl"])?;
|
|
||||||
|
|
||||||
let egl_ctx = Arc::new(Mutex::new(DeviceContext::new(0)));
|
let egl_ctx = Arc::new(Mutex::new(DeviceContext::new(0)));
|
||||||
|
|
||||||
let resource = Arc::new(Mutex::new(GraphicsResource::new(&device)));
|
let resource = Arc::new(Mutex::new(GraphicsResource::new(&device)));
|
||||||
|
@ -247,7 +202,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
"cores/swanstation_libretro.so".into(),
|
"cores/swanstation_libretro.so".into(),
|
||||||
));
|
));
|
||||||
let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadGame(
|
let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadGame(
|
||||||
"roms/nmv1_us.cue".into(),
|
"roms/merged/nmv2/jagb/nmv2jagb.cue".into(),
|
||||||
));
|
));
|
||||||
|
|
||||||
// start the libretro thread looping now that we're alive
|
// start the libretro thread looping now that we're alive
|
||||||
|
@ -257,63 +212,6 @@ async fn main() -> anyhow::Result<()> {
|
||||||
match retro_event_rx.blocking_recv() {
|
match retro_event_rx.blocking_recv() {
|
||||||
Some(msg) => match msg {
|
Some(msg) => match msg {
|
||||||
RetroEvent::Frame => {
|
RetroEvent::Frame => {
|
||||||
/*
|
|
||||||
let mut same = true;
|
|
||||||
|
|
||||||
{
|
|
||||||
let mut frame_locked = frame.lock().expect(
|
|
||||||
"Couldn't lock frame on our end. Did the encoder thread panic?",
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut_frame = frame_locked.as_mut().expect("it's None? why?");
|
|
||||||
|
|
||||||
let width = mut_frame.width();
|
|
||||||
let height = mut_frame.height();
|
|
||||||
|
|
||||||
let mut surf = surface_clone.lock().expect(
|
|
||||||
"locking the VNC surface to paint it to the ffmpeg frame failed",
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
let surf_buf = surf.get_buffer();
|
|
||||||
|
|
||||||
let buf_ptr =
|
|
||||||
unsafe { (*(*mut_frame.as_mut_ptr()).buf[0]).data as *mut u32 };
|
|
||||||
|
|
||||||
for y in 0..height {
|
|
||||||
let line_stride = (y * width) as usize;
|
|
||||||
// Make a slice for the line
|
|
||||||
// SAFETY: The allocation is guaranteed to be large enough
|
|
||||||
// for this to work from y = 0..height
|
|
||||||
let dest_line_slice = unsafe {
|
|
||||||
let dest_line_ptr = buf_ptr.add(line_stride);
|
|
||||||
std::slice::from_raw_parts_mut(
|
|
||||||
dest_line_ptr,
|
|
||||||
width as usize,
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
// If any line differs then the frame isn't the same and should be encoded.
|
|
||||||
if &surf_buf[line_stride..line_stride + width as usize]
|
|
||||||
!= dest_line_slice
|
|
||||||
{
|
|
||||||
same = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
dest_line_slice.copy_from_slice(
|
|
||||||
&surf_buf[line_stride..line_stride + width as usize],
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !same {
|
|
||||||
let _ = state_clone
|
|
||||||
.encoder_tx
|
|
||||||
.blocking_lock()
|
|
||||||
.blocking_send(encoder_thread::EncodeThreadInput::SendFrame);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
let _ = state_clone
|
let _ = state_clone
|
||||||
.encoder_tx
|
.encoder_tx
|
||||||
.blocking_lock()
|
.blocking_lock()
|
||||||
|
@ -321,23 +219,9 @@ async fn main() -> anyhow::Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
RetroEvent::Resize { size } => {
|
RetroEvent::Resize { size } => {
|
||||||
// make a new frame for the encoder
|
|
||||||
|
|
||||||
let _ = state_clone.encoder_tx.blocking_lock().blocking_send(
|
let _ = state_clone.encoder_tx.blocking_lock().blocking_send(
|
||||||
encoder_thread::EncodeThreadInput::Init { size: size.clone() },
|
encoder_thread::EncodeThreadInput::Init { size: size.clone() },
|
||||||
);
|
);
|
||||||
|
|
||||||
/*
|
|
||||||
{
|
|
||||||
let mut lk_frame = frame_clone.lock().expect("Couldn't lock frame");
|
|
||||||
|
|
||||||
*lk_frame = Some(ffmpeg::frame::Video::new(
|
|
||||||
ffmpeg::format::Pixel::BGRA,
|
|
||||||
size.clone().width,
|
|
||||||
size.clone().height,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
RetroEvent::WantInputs { tx } => {
|
RetroEvent::WantInputs { tx } => {
|
||||||
|
|
|
@ -48,6 +48,24 @@ impl MappedGraphicsResource {
|
||||||
|
|
||||||
Ok(array)
|
Ok(array)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_device_pointer(&mut self) -> Result<cuda_sys::CUdeviceptr, cuda_result::DriverError> {
|
||||||
|
assert!(
|
||||||
|
!self.resource.is_null(),
|
||||||
|
"do not call GraphicsResource::get_mapped_array if no resource is actually registered"
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut array: cuda_sys::CUdeviceptr = 0;
|
||||||
|
let mut size: usize = 0;
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
cuda_sys::lib()
|
||||||
|
.cuGraphicsResourceGetMappedPointer_v2(&mut array, &mut size, self.resource)
|
||||||
|
.result()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(array)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for MappedGraphicsResource {
|
impl Drop for MappedGraphicsResource {
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
use cudarc::driver::{
|
use anyhow::Context;
|
||||||
|
use cudarc::{driver::{
|
||||||
sys::{CUdeviceptr, CUmemorytype},
|
sys::{CUdeviceptr, CUmemorytype},
|
||||||
CudaDevice, LaunchAsync,
|
CudaDevice, CudaSlice, DevicePtr, LaunchAsync,
|
||||||
};
|
}, nvrtc::CompileOptions};
|
||||||
use letsplay_gpu::egl_helpers::DeviceContext;
|
use letsplay_gpu::egl_helpers::DeviceContext;
|
||||||
use std::{
|
use std::{
|
||||||
sync::{Arc, Mutex},
|
sync::{Arc, Mutex},
|
||||||
|
@ -233,6 +234,23 @@ pub fn encoder_thread_spawn_swframe(
|
||||||
(out_rx, in_tx)
|
(out_rx, in_tx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Source for the kernel used to flip OpenGL framebuffers right-side up.
|
||||||
|
const OPENGL_FLIP_KERNEL_SRC: &str = "
|
||||||
|
extern \"C\" __global__ void flip_opengl(
|
||||||
|
const unsigned* pSrc,
|
||||||
|
unsigned* pDest,
|
||||||
|
int width,
|
||||||
|
int height
|
||||||
|
) {
|
||||||
|
const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
|
||||||
|
if (x < width && y < height) {
|
||||||
|
unsigned reversed_y = (height - 1) - y;
|
||||||
|
((unsigned*)pDest)[y * width + x] = ((unsigned*)pSrc)[reversed_y * width + x];
|
||||||
|
}
|
||||||
|
}";
|
||||||
|
|
||||||
fn encoder_thread_hwframe_main(
|
fn encoder_thread_hwframe_main(
|
||||||
mut rx: mpsc::Receiver<EncodeThreadInput>,
|
mut rx: mpsc::Receiver<EncodeThreadInput>,
|
||||||
tx: mpsc::Sender<EncodeThreadOutput>,
|
tx: mpsc::Sender<EncodeThreadOutput>,
|
||||||
|
@ -249,7 +267,18 @@ fn encoder_thread_hwframe_main(
|
||||||
// :)
|
// :)
|
||||||
cuda_device.bind_to_thread()?;
|
cuda_device.bind_to_thread()?;
|
||||||
|
|
||||||
/*
|
// Compile the support kernel
|
||||||
|
let ptx = cudarc::nvrtc::compile_ptx_with_opts(
|
||||||
|
&OPENGL_FLIP_KERNEL_SRC,
|
||||||
|
CompileOptions {
|
||||||
|
//options: vec!["--gpu-architecture=compute_50".into()],
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.with_context(|| "compiling support kernel")?;
|
||||||
|
|
||||||
|
// pop it in
|
||||||
|
cuda_device.load_ptx(ptx, "module", &["flip_opengl"])?;
|
||||||
|
|
||||||
let mut memcpy = cudarc::driver::sys::CUDA_MEMCPY2D_st::default();
|
let mut memcpy = cudarc::driver::sys::CUDA_MEMCPY2D_st::default();
|
||||||
// setup basic src stuff
|
// setup basic src stuff
|
||||||
|
@ -261,7 +290,9 @@ fn encoder_thread_hwframe_main(
|
||||||
memcpy.dstXInBytes = 0;
|
memcpy.dstXInBytes = 0;
|
||||||
memcpy.dstY = 0;
|
memcpy.dstY = 0;
|
||||||
memcpy.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
|
memcpy.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
|
||||||
*/
|
|
||||||
|
// Temporary buffer used for opengl flip on the GPU
|
||||||
|
let mut temp_buffer: CudaSlice<u32> = cuda_device.alloc_zeros::<u32>(48).expect("over");
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
match rx.try_recv() {
|
match rx.try_recv() {
|
||||||
|
@ -273,6 +304,10 @@ fn encoder_thread_hwframe_main(
|
||||||
force_keyframe = false;
|
force_keyframe = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
temp_buffer = cuda_device
|
||||||
|
.alloc_zeros::<u32>((size.width * size.height) as usize)
|
||||||
|
.expect("oh youre fucked anyways");
|
||||||
|
|
||||||
encoder
|
encoder
|
||||||
.init(cuda_device, size)
|
.init(cuda_device, size)
|
||||||
.expect("encoder init failed");
|
.expect("encoder init failed");
|
||||||
|
@ -299,53 +334,17 @@ fn encoder_thread_hwframe_main(
|
||||||
.get_mapped_array()
|
.get_mapped_array()
|
||||||
.expect("well its all over anyways");
|
.expect("well its all over anyways");
|
||||||
|
|
||||||
|
let frame = encoder.frame();
|
||||||
|
|
||||||
|
// setup the cuMemcpy2D operation to copy to the temporary buffer
|
||||||
|
// (we should probably abstract source and provide a way to elide this,
|
||||||
|
// and instead feed ffmpeg directly. for now it's *just* used with gl so /shrug)
|
||||||
{
|
{
|
||||||
let frame = encoder.frame();
|
|
||||||
let width = frame.width();
|
|
||||||
let height = frame.height();
|
|
||||||
|
|
||||||
let launch_config = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (width / 16 + 1, height / 2 + 1, 0),
|
|
||||||
block_dim: (16, 2, 0),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
let flip_opengl = cuda_device
|
|
||||||
.get_func("module", "flip_opengl")
|
|
||||||
.expect("dumb fucker");
|
|
||||||
|
|
||||||
unsafe {
|
|
||||||
let frame_ptr = frame.as_mut_ptr();
|
|
||||||
|
|
||||||
let mut params = [
|
|
||||||
array as *mut std::ffi::c_void,
|
|
||||||
(*frame_ptr).data[0] as CUdeviceptr as *mut std::ffi::c_void,
|
|
||||||
width as *const u32 as *mut std::ffi::c_void,
|
|
||||||
height as *const u32 as *mut std::ffi::c_void,
|
|
||||||
];
|
|
||||||
|
|
||||||
flip_opengl
|
|
||||||
.launch(launch_config, &mut params[..])
|
|
||||||
.expect("oh its done");
|
|
||||||
|
|
||||||
|
|
||||||
cudarc::driver::sys::lib()
|
|
||||||
.cuStreamSynchronize(std::ptr::null_mut())
|
|
||||||
.result()
|
|
||||||
.expect("you banned");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
// setup the cuMemcpy2D
|
|
||||||
{
|
|
||||||
let frame = encoder.frame();
|
|
||||||
|
|
||||||
memcpy.srcArray = array;
|
memcpy.srcArray = array;
|
||||||
|
|
||||||
unsafe {
|
unsafe {
|
||||||
let frame_ptr = frame.as_mut_ptr();
|
let frame_ptr = frame.as_mut_ptr();
|
||||||
memcpy.dstDevice = (*frame_ptr).data[0] as CUdeviceptr;
|
memcpy.dstDevice = temp_buffer.device_ptr().clone();
|
||||||
memcpy.dstPitch = (*frame_ptr).linesize[0] as usize;
|
memcpy.dstPitch = (*frame_ptr).linesize[0] as usize;
|
||||||
memcpy.WidthInBytes = ((*frame_ptr).width * 4) as usize;
|
memcpy.WidthInBytes = ((*frame_ptr).width * 4) as usize;
|
||||||
memcpy.Height = (*frame_ptr).height as usize;
|
memcpy.Height = (*frame_ptr).height as usize;
|
||||||
|
@ -357,13 +356,47 @@ fn encoder_thread_hwframe_main(
|
||||||
.cuMemcpy2DAsync_v2(&memcpy, std::ptr::null_mut())
|
.cuMemcpy2DAsync_v2(&memcpy, std::ptr::null_mut())
|
||||||
.result()
|
.result()
|
||||||
.expect("cuMemcpy2D fail epic");
|
.expect("cuMemcpy2D fail epic");
|
||||||
|
}
|
||||||
|
|
||||||
cudarc::driver::sys::lib()
|
// launch kernel to flip the opengl framebuffer right-side up
|
||||||
.cuStreamSynchronize(std::ptr::null_mut())
|
{
|
||||||
.result()
|
let width = frame.width();
|
||||||
.expect("you banned");
|
let height = frame.height();
|
||||||
|
|
||||||
} */
|
let launch_config = cudarc::driver::LaunchConfig {
|
||||||
|
grid_dim: (width / 16 + 1, height / 2 + 1, 1),
|
||||||
|
block_dim: (16, 2, 1),
|
||||||
|
shared_mem_bytes: 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
let flip_opengl = cuda_device
|
||||||
|
.get_func("module", "flip_opengl")
|
||||||
|
.expect("dumb fucker");
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let frame_ptr = frame.as_mut_ptr();
|
||||||
|
|
||||||
|
let mut slice = cuda_device.upgrade_device_ptr::<u32>(
|
||||||
|
(*frame_ptr).data[0] as CUdeviceptr,
|
||||||
|
(width * height) as usize * 4usize,
|
||||||
|
);
|
||||||
|
|
||||||
|
flip_opengl
|
||||||
|
.launch(
|
||||||
|
launch_config,
|
||||||
|
(&mut temp_buffer, &mut slice, width, height),
|
||||||
|
)
|
||||||
|
.expect("I hate you");
|
||||||
|
|
||||||
|
cudarc::driver::sys::lib()
|
||||||
|
.cuStreamSynchronize(std::ptr::null_mut())
|
||||||
|
.result()
|
||||||
|
.expect("you banned");
|
||||||
|
|
||||||
|
// leak so it doesnt free the memory like a dumbass
|
||||||
|
let _ = slice.leak();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
mapped.unmap().expect("fuck you asshole");
|
mapped.unmap().expect("fuck you asshole");
|
||||||
gl_ctx.release();
|
gl_ctx.release();
|
||||||
|
|
|
@ -163,6 +163,10 @@ impl H264Encoder {
|
||||||
Ok(Self::NvencSWFrame { encoder: encoder })
|
Ok(Self::NvencSWFrame { encoder: encoder })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates a new hardware (NVIDIA NVENC) encoder, which encodes
|
||||||
|
/// frames from GPU memory, via CUDA.
|
||||||
|
/// FFmpeg handles uploading frames to the GPU.
|
||||||
|
/// You are expected to handle uploading or otherwise working with a frame on the GPU.
|
||||||
pub fn new_nvenc_hwframe(
|
pub fn new_nvenc_hwframe(
|
||||||
cuda_device: &CudaDevice,
|
cuda_device: &CudaDevice,
|
||||||
size: Size,
|
size: Size,
|
||||||
|
@ -188,8 +192,8 @@ impl H264Encoder {
|
||||||
|
|
||||||
video_encoder_context.set_format(ffmpeg::format::Pixel::CUDA);
|
video_encoder_context.set_format(ffmpeg::format::Pixel::CUDA);
|
||||||
|
|
||||||
video_encoder_context.set_qmin(37);
|
video_encoder_context.set_qmin(35);
|
||||||
video_encoder_context.set_qmax(33);
|
video_encoder_context.set_qmax(30);
|
||||||
|
|
||||||
unsafe {
|
unsafe {
|
||||||
// FIXME: this currently breaks the avbufferref system a bit
|
// FIXME: this currently breaks the avbufferref system a bit
|
||||||
|
|
Loading…
Reference in a new issue