diff --git a/.gitignore b/.gitignore
index 8c1430b..a3fdb33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@
 client/.parcel-cache
 client/dist
 client/node_modules
-/server/target
\ No newline at end of file
+/server/target
+/server/testdata
\ No newline at end of file
diff --git a/server/src/main.rs b/server/src/main.rs
index ee3bc01..f10899d 100644
--- a/server/src/main.rs
+++ b/server/src/main.rs
@@ -22,33 +22,12 @@ use std::{
 	time::Duration,
 };
 
-use rand::distributions::DistString;
 use std::net::SocketAddr;
 use tokio::sync::{
-	broadcast,
 	mpsc::{self, error::TryRecvError},
 	Mutex as TokioMutex,
};
 
-use axum::{
-	extract::{
-		connect_info::ConnectInfo,
-		ws::{self, Message, WebSocket, WebSocketUpgrade},
-		State,
-	},
-	response::IntoResponse,
-	routing::get,
-	Router,
-};
-
-use futures::{sink::SinkExt, stream::StreamExt};
-
-#[derive(Clone)]
-enum WsMessage {
-	VideoPacket { packet: ffmpeg::Packet },
-	Json(String),
-}
-
 struct AppState {
 	encoder_tx: Arc>>,
 	inputs: Arc>>,
@@ -184,23 +163,6 @@ where
 	}
 }
 
-const FLIP_SRC: &str = "
-extern \"C\" __global__ void flip_opengl(
-	const unsigned* pSrc,
-	unsigned* pDest,
-	unsigned width,
-	unsigned height
-) {
-	const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
-	const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
-
-	if (x < width && y < height) {
-		// let reversed_y = (size.height - 1) - y;
-		unsigned reversed_y = (height - 1) - y;
-		pDest[y * width + x] = pSrc[reversed_y * width + x];
-	}
-}";
-
 #[tokio::main(flavor = "multi_thread", worker_threads = 2)]
 async fn main() -> anyhow::Result<()> {
 	// Setup a tracing subscriber
@@ -213,13 +175,6 @@ async fn main() -> anyhow::Result<()> {
 	// H.264 encoder related
 	let device = CudaDevice::new(0)?;
 
-	let ptx = cudarc::nvrtc::compile_ptx(&FLIP_SRC).with_context(|| "compiling support kernel")?;
-
-	println!("compiled kernel");
-
-	// pop it in
-	device.load_ptx(ptx, "module", &["flip_opengl"])?;
-
 	let egl_ctx = Arc::new(Mutex::new(DeviceContext::new(0)));
 	let resource = Arc::new(Mutex::new(GraphicsResource::new(&device)));
 
@@ -247,7 +202,7 @@ async fn main() -> anyhow::Result<()> {
 		"cores/swanstation_libretro.so".into(),
 	));
 	let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadGame(
-		"roms/nmv1_us.cue".into(),
+		"roms/merged/nmv2/jagb/nmv2jagb.cue".into(),
 	));
 
 	// start the libretro thread looping now that we're alive
@@ -257,63 +212,6 @@
 		match retro_event_rx.blocking_recv() {
 			Some(msg) => match msg {
 				RetroEvent::Frame => {
-					/*
-					let mut same = true;
-
-					{
-						let mut frame_locked = frame.lock().expect(
-							"Couldn't lock frame on our end. Did the encoder thread panic?",
-						);
-
-						let mut_frame = frame_locked.as_mut().expect("it's None? why?");
-
-						let width = mut_frame.width();
-						let height = mut_frame.height();
-
-						let mut surf = surface_clone.lock().expect(
-							"locking the VNC surface to paint it to the ffmpeg frame failed",
-						);
-
-
-						let surf_buf = surf.get_buffer();
-
-						let buf_ptr =
-							unsafe { (*(*mut_frame.as_mut_ptr()).buf[0]).data as *mut u32 };
-
-						for y in 0..height {
-							let line_stride = (y * width) as usize;
-							// Make a slice for the line
-							// SAFETY: The allocation is guaranteed to be large enough
-							// for this to work from y = 0..height
-							let dest_line_slice = unsafe {
-								let dest_line_ptr = buf_ptr.add(line_stride);
-								std::slice::from_raw_parts_mut(
-									dest_line_ptr,
-									width as usize,
-								)
-							};
-
-							// If any line differs then the frame isn't the same and should be encoded.
-							if &surf_buf[line_stride..line_stride + width as usize]
-								!= dest_line_slice
-							{
-								same = false;
-							}
-
-							dest_line_slice.copy_from_slice(
-								&surf_buf[line_stride..line_stride + width as usize],
-							);
-						}
-					}
-
-					if !same {
-						let _ = state_clone
-							.encoder_tx
-							.blocking_lock()
-							.blocking_send(encoder_thread::EncodeThreadInput::SendFrame);
-					}
-					*/
-
 					let _ = state_clone
 						.encoder_tx
 						.blocking_lock()
@@ -321,23 +219,9 @@ async fn main() -> anyhow::Result<()> {
 				}
 
 				RetroEvent::Resize { size } => {
-					// make a new frame for the encoder
-
 					let _ = state_clone.encoder_tx.blocking_lock().blocking_send(
 						encoder_thread::EncodeThreadInput::Init { size: size.clone() },
 					);
-
-					/*
-					{
-						let mut lk_frame = frame_clone.lock().expect("Couldn't lock frame");
-
-						*lk_frame = Some(ffmpeg::frame::Video::new(
-							ffmpeg::format::Pixel::BGRA,
-							size.clone().width,
-							size.clone().height,
-						));
-					}
-					*/
 				}
 
 				RetroEvent::WantInputs { tx } => {
diff --git a/server/src/video/cuda_gl/safe.rs b/server/src/video/cuda_gl/safe.rs
index 11f2b0c..5a1fa60 100644
--- a/server/src/video/cuda_gl/safe.rs
+++ b/server/src/video/cuda_gl/safe.rs
@@ -48,6 +48,24 @@ impl MappedGraphicsResource {
 
 		Ok(array)
 	}
+
+	pub fn get_device_pointer(&mut self) -> Result {
+		assert!(
+			!self.resource.is_null(),
+			"do not call GraphicsResource::get_mapped_array if no resource is actually registered"
+		);
+
+		let mut array: cuda_sys::CUdeviceptr = 0;
+		let mut size: usize = 0;
+
+		unsafe {
+			cuda_sys::lib()
+				.cuGraphicsResourceGetMappedPointer_v2(&mut array, &mut size, self.resource)
+				.result()?;
+		}
+
+		Ok(array)
+	}
 }
 
 impl Drop for MappedGraphicsResource {
diff --git a/server/src/video/encoder_thread.rs b/server/src/video/encoder_thread.rs
index d75ec75..dd5f7a4 100644
--- a/server/src/video/encoder_thread.rs
+++ b/server/src/video/encoder_thread.rs
@@ -1,7 +1,8 @@
-use cudarc::driver::{
+use anyhow::Context;
+use cudarc::{driver::{
 	sys::{CUdeviceptr, CUmemorytype},
-	CudaDevice, LaunchAsync,
-};
+	CudaDevice, CudaSlice, DevicePtr, LaunchAsync,
+}, nvrtc::CompileOptions};
 use letsplay_gpu::egl_helpers::DeviceContext;
 use std::{
 	sync::{Arc, Mutex},
@@ -233,6 +234,23 @@ pub fn encoder_thread_spawn_swframe(
 	(out_rx, in_tx)
 }
 
+/// Source for the kernel used to flip OpenGL framebuffers right-side up.
+const OPENGL_FLIP_KERNEL_SRC: &str = "
+extern \"C\" __global__ void flip_opengl(
+	const unsigned* pSrc,
+	unsigned* pDest,
+	int width,
+	int height
+) {
+	const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
+	const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if (x < width && y < height) {
+		unsigned reversed_y = (height - 1) - y;
+		((unsigned*)pDest)[y * width + x] = ((unsigned*)pSrc)[reversed_y * width + x];
+	}
+}";
+
 fn encoder_thread_hwframe_main(
 	mut rx: mpsc::Receiver,
 	tx: mpsc::Sender,
@@ -249,7 +267,18 @@ fn encoder_thread_hwframe_main(
 	// :)
 	cuda_device.bind_to_thread()?;
 
-	/*
+	// Compile the support kernel
+	let ptx = cudarc::nvrtc::compile_ptx_with_opts(
+		&OPENGL_FLIP_KERNEL_SRC,
+		CompileOptions {
+			//options: vec!["--gpu-architecture=compute_50".into()],
+			..Default::default()
+		},
+	)
+	.with_context(|| "compiling support kernel")?;
+
+	// pop it in
+	cuda_device.load_ptx(ptx, "module", &["flip_opengl"])?;
 	let mut memcpy = cudarc::driver::sys::CUDA_MEMCPY2D_st::default();
 
 	// setup basic src stuff
@@ -261,7 +290,9 @@ fn encoder_thread_hwframe_main(
 	memcpy.dstXInBytes = 0;
 	memcpy.dstY = 0;
 	memcpy.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
-	*/
+
+	// Temporary buffer used for opengl flip on the GPU
+	let mut temp_buffer: CudaSlice<u32> = cuda_device.alloc_zeros::<u32>(48).expect("over");
 
 	loop {
 		match rx.try_recv() {
@@ -273,6 +304,10 @@ fn encoder_thread_hwframe_main(
 						force_keyframe = false;
 					}
 
+					temp_buffer = cuda_device
+						.alloc_zeros::<u32>((size.width * size.height) as usize)
+						.expect("oh youre fucked anyways");
+
 					encoder
 						.init(cuda_device, size)
 						.expect("encoder init failed");
@@ -299,53 +334,17 @@ fn encoder_thread_hwframe_main(
 						.get_mapped_array()
 						.expect("well its all over anyways");
 
+					let frame = encoder.frame();
+
+					// setup the cuMemcpy2D operation to copy to the temporary buffer
+					// (we should probably abstract source and provide a way to elide this,
+					// and instead feed ffmpeg directly. for now it's *just* used with gl so /shrug)
 					{
-						let frame = encoder.frame();
-						let width = frame.width();
-						let height = frame.height();
-
-						let launch_config = cudarc::driver::LaunchConfig {
-							grid_dim: (width / 16 + 1, height / 2 + 1, 0),
-							block_dim: (16, 2, 0),
-							shared_mem_bytes: 0,
-						};
-
-						let flip_opengl = cuda_device
-							.get_func("module", "flip_opengl")
-							.expect("dumb fucker");
-
-						unsafe {
-							let frame_ptr = frame.as_mut_ptr();
-
-							let mut params = [
-								array as *mut std::ffi::c_void,
-								(*frame_ptr).data[0] as CUdeviceptr as *mut std::ffi::c_void,
-								width as *const u32 as *mut std::ffi::c_void,
-								height as *const u32 as *mut std::ffi::c_void,
-							];
-
-							flip_opengl
-								.launch(launch_config, &mut params[..])
-								.expect("oh its done");
-
-
-							cudarc::driver::sys::lib()
-								.cuStreamSynchronize(std::ptr::null_mut())
-								.result()
-								.expect("you banned");
-						}
-					}
-
-					/*
-					// setup the cuMemcpy2D
-					{
-						let frame = encoder.frame();
-
 						memcpy.srcArray = array;
 
 						unsafe {
							let frame_ptr = frame.as_mut_ptr();
-							memcpy.dstDevice = (*frame_ptr).data[0] as CUdeviceptr;
+							memcpy.dstDevice = temp_buffer.device_ptr().clone();
 							memcpy.dstPitch = (*frame_ptr).linesize[0] as usize;
 							memcpy.WidthInBytes = ((*frame_ptr).width * 4) as usize;
 							memcpy.Height = (*frame_ptr).height as usize;
@@ -357,13 +356,47 @@ fn encoder_thread_hwframe_main(
 								.cuMemcpy2DAsync_v2(&memcpy, std::ptr::null_mut())
 								.result()
 								.expect("cuMemcpy2D fail epic");
+						}
 
-							cudarc::driver::sys::lib()
-								.cuStreamSynchronize(std::ptr::null_mut())
-								.result()
-								.expect("you banned");
+					// launch kernel to flip the opengl framebuffer right-side up
+					{
+						let width = frame.width();
+						let height = frame.height();
 
-					} */
+						let launch_config = cudarc::driver::LaunchConfig {
+							grid_dim: (width / 16 + 1, height / 2 + 1, 1),
+							block_dim: (16, 2, 1),
+							shared_mem_bytes: 0,
+						};
+
+						let flip_opengl = cuda_device
+							.get_func("module", "flip_opengl")
+							.expect("dumb fucker");
+
+						unsafe {
+							let frame_ptr = frame.as_mut_ptr();
+
+							let mut slice = cuda_device.upgrade_device_ptr::<u8>(
+								(*frame_ptr).data[0] as CUdeviceptr,
+								(width * height) as usize * 4usize,
+							);
+
+							flip_opengl
+								.launch(
+									launch_config,
+									(&mut temp_buffer, &mut slice, width, height),
+								)
+								.expect("I hate you");
+
+							cudarc::driver::sys::lib()
+								.cuStreamSynchronize(std::ptr::null_mut())
+								.result()
+								.expect("you banned");
+
+							// leak so it doesnt free the memory like a dumbass
+							let _ = slice.leak();
+						}
+					}
 
 					mapped.unmap().expect("fuck you asshole");
 					gl_ctx.release();
diff --git a/server/src/video/h264_encoder.rs b/server/src/video/h264_encoder.rs
index 313fbfb..583ddfd 100644
--- a/server/src/video/h264_encoder.rs
+++ b/server/src/video/h264_encoder.rs
@@ -163,6 +163,10 @@ impl H264Encoder {
 		Ok(Self::NvencSWFrame { encoder: encoder })
 	}
 
+	/// Creates a new hardware (NVIDIA NVENC) encoder, which encodes
+	/// frames from GPU memory, via CUDA.
+	/// Unlike the software-frame variant, FFmpeg does not handle uploading frames to the GPU here.
+	/// You are expected to handle uploading or otherwise working with a frame on the GPU.
 	pub fn new_nvenc_hwframe(
 		cuda_device: &CudaDevice,
 		size: Size,
@@ -188,8 +192,8 @@
 
 		video_encoder_context.set_format(ffmpeg::format::Pixel::CUDA);
 
-		video_encoder_context.set_qmin(37);
-		video_encoder_context.set_qmax(33);
+		video_encoder_context.set_qmin(35);
+		video_encoder_context.set_qmax(30);
 
 		unsafe {
 			// FIXME: this currently breaks the avbufferref system a bit
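For reference, the sketch below is not part of the patch. It is a minimal, standalone exercise of the `flip_opengl` kernel that the patch moves into `encoder_thread.rs`, driven through the same cudarc entry points the patch uses (`compile_ptx`, `load_ptx`, `get_func`, `LaunchConfig`, `launch`); the `main` function, the 4x3 test pattern, and the host-copy helpers (`htod_copy`, `dtoh_sync_copy`, `alloc_zeros`) are my additions for illustration, assuming cudarc's standard device API, and are not taken from the repository.

```rust
// Standalone sketch (not the project's code): flip a tiny "image" with the
// same kernel source and launch geometry style used in the patch.
use cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig};

const KERNEL_SRC: &str = r#"
extern "C" __global__ void flip_opengl(
    const unsigned* pSrc, unsigned* pDest, int width, int height
) {
    const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        unsigned reversed_y = (height - 1) - y;
        pDest[y * width + x] = pSrc[reversed_y * width + x];
    }
}"#;

fn main() -> anyhow::Result<()> {
    let device = CudaDevice::new(0)?;

    // Compile and load the kernel, mirroring the compile_ptx/load_ptx calls in the patch.
    let ptx = cudarc::nvrtc::compile_ptx(KERNEL_SRC)?;
    device.load_ptx(ptx, "module", &["flip_opengl"])?;

    // 4x3 test image: every pixel carries its row index, so the flip is easy to verify.
    let (width, height) = (4usize, 3usize);
    let host: Vec<u32> = (0..height)
        .flat_map(|y| std::iter::repeat(y as u32).take(width))
        .collect();

    let src = device.htod_copy(host)?;
    let mut dst = device.alloc_zeros::<u32>(width * height)?;

    // One thread per pixel, rounded up to the 16x2 block shape the patch uses.
    let cfg = LaunchConfig {
        grid_dim: ((width as u32 + 15) / 16, (height as u32 + 1) / 2, 1),
        block_dim: (16, 2, 1),
        shared_mem_bytes: 0,
    };

    let flip = device
        .get_func("module", "flip_opengl")
        .expect("kernel should have been loaded above");
    unsafe {
        flip.launch(cfg, (&src, &mut dst, width as i32, height as i32))?;
    }

    // dtoh_sync_copy synchronizes before copying the result back.
    let flipped = device.dtoh_sync_copy(&dst)?;
    assert_eq!(flipped[0], (height - 1) as u32); // row 0 is now the old bottom row
    Ok(())
}
```

In the patch itself the source is not a plain device buffer: the mapped OpenGL resource comes back as a CUDA array (`get_mapped_array`), which a kernel cannot dereference as a raw pointer, so the patch first copies it into the linear `temp_buffer` with `cuMemcpy2DAsync_v2` and only then launches `flip_opengl` from that buffer into the encoder frame. The `get_device_pointer` helper added to `cuda_gl/safe.rs` would be another way to obtain a linear pointer, but it does not appear to be called in the hunks shown.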