working gpu-only encode

2024-10-14 07:25:59 -04:00 · 2024-10-14 07:25:59 -04:00 · 09142f9668
commit 09142f9668
parent 9248fe91a9
14 changed files with 920 additions and 277 deletions
--- a/server/Cargo.lock
+++ b/server/Cargo.lock
@ -1414,6 +1414,7 @@ dependencies = [
 "futures-util",
 "gl",
 "letsplay_gpu",
 "libloading",
 "rand",
 "retro_frontend",
 "serde",
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@ -32,6 +32,7 @@ tracing = "0.1.40"
 tracing-subscriber = "0.3.18"
 xkeysym = "0.2.1"
 async-trait = "0.1.83"
 libloading = "0.8.5"
 [patch.crates-io]
--- a/server/src/main.rs
+++ b/server/src/main.rs
@ -5,10 +5,15 @@ mod video;
 mod transport;
 use anyhow::Context;
 use async_trait::async_trait;
 use cudarc::driver::CudaDevice;
 use letsplay_gpu::egl_helpers::DeviceContext;
 use retro_thread::{spawn_retro_thread, RetroEvent};
 use transport::websocket::WebsocketTransport;
 use transport::{Transport, TransportReciever};
 use video::cuda_gl::safe::GraphicsResource;
 use video::encoder_thread::EncodeThreadInput;
 use video::{encoder_thread, ffmpeg};
@ -44,19 +49,19 @@ enum WsMessage {
    Json(String),
 }
-struct AppState {
+struct AppState<T: Transport> {
    encoder_tx: Arc<TokioMutex<mpsc::Sender<EncodeThreadInput>>>,
    inputs: Arc<TokioMutex<Vec<u32>>>,
-    transport: Arc<crate::transport::websocket::WebsocketTransport>,
+    transport: Arc<T>,
    connection_count: TokioMutex<usize>,
 }
-impl AppState {
+impl<T> AppState<T>
-    fn new(
+where
-        encoder_tx: mpsc::Sender<EncodeThreadInput>,
+    T: Transport + Send + Sync + 'static,
-        transport: Arc<crate::transport::websocket::WebsocketTransport>,
+{
-    ) -> Self {
+    fn new(encoder_tx: mpsc::Sender<EncodeThreadInput>, transport: Arc<T>) -> Self {
        Self {
            encoder_tx: Arc::new(TokioMutex::new(encoder_tx)),
            inputs: Arc::new(TokioMutex::new(Vec::new())),
@ -67,7 +72,10 @@ impl AppState {
 }
 #[async_trait]
-impl TransportReciever for AppState {
+impl<T> TransportReciever for AppState<T>
 where
    T: Transport + Send + Sync + 'static,
 {
    async fn on_connect(&self, username: &String) -> anyhow::Result<()> {
        println!("{username} joined!");
@ -105,7 +113,6 @@ impl TransportReciever for AppState {
                        "msg": json["msg"].as_str().unwrap()
                    });
                    self.transport
                        .broadcast_message(transport::TransportMessage::Text(
                            serde_json::to_string(&send).expect("oh well"),
@ -177,6 +184,23 @@ impl TransportReciever for AppState {
    }
 }
 const FLIP_SRC: &str = "
 extern \"C\" __global__ void flip_opengl(
    const unsigned* pSrc,
    unsigned* pDest,
    unsigned width, 
    unsigned height
 ) {
    const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // let reversed_y = (size.height - 1) - y;
        unsigned reversed_y = (height - 1) - y;
        pDest[y * width + x] = pSrc[reversed_y * width + x];
    }
 }";
 #[tokio::main(flavor = "multi_thread", worker_threads = 2)]
 async fn main() -> anyhow::Result<()> {
    // Setup a tracing subscriber
@ -186,36 +210,56 @@ async fn main() -> anyhow::Result<()> {
    tracing::subscriber::set_global_default(subscriber).unwrap();
    let surface = Arc::new(Mutex::new(surface::Surface::new()));
    // H.264 encoder related
-    let frame: Arc<Mutex<Option<ffmpeg::frame::Video>>> = Arc::new(Mutex::new(None));
+    let device = CudaDevice::new(0)?;
    let (mut encoder_rx, encoder_tx) = encoder_thread::encoder_thread_spawn(&frame);
-    let transport = Arc::new(crate::transport::websocket::WebsocketTransport::new());
+    let ptx = cudarc::nvrtc::compile_ptx(&FLIP_SRC).with_context(|| "compiling support kernel")?;
    println!("compiled kernel");
    // pop it in
    device.load_ptx(ptx, "module", &["flip_opengl"])?;
    let egl_ctx = Arc::new(Mutex::new(DeviceContext::new(0)));
    let resource = Arc::new(Mutex::new(GraphicsResource::new(&device)));
    let (mut encoder_rx, encoder_tx) = encoder_thread::encoder_thread_spawn_hwframe(
        &device.clone(),
        &resource.clone(),
        &egl_ctx.clone(),
    );
    let transport = Arc::new(WebsocketTransport::new());
    let state = Arc::new(AppState::new(encoder_tx, transport.clone()));
-    let (mut event_rx, event_in_tx) = spawn_retro_thread(surface.clone());
+    let (mut retro_event_rx, retro_input_event_tx) =
        spawn_retro_thread(egl_ctx.clone(), resource.clone());
    let state_clone = state.clone();
    let transport_clone = transport.clone();
    // retro event handler. drives the encoder thread too
    let _ = std::thread::Builder::new()
        .name("retro_event_rx".into())
        .spawn(move || {
-            let surface_clone = surface.clone();
+            // load game
-            let frame_clone = frame.clone();
+            let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadCore(
                "cores/swanstation_libretro.so".into(),
            ));
            let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadGame(
                "roms/nmv1_us.cue".into(),
            ));
            // start the libretro thread looping now that we're alive
-            let _ = event_in_tx.blocking_send(retro_thread::RetroInEvent::Start);
+            let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::Start);
            loop {
-                match event_rx.try_recv() {
+                match retro_event_rx.blocking_recv() {
-                    Ok(msg) => match msg {
+                    Some(msg) => match msg {
                        RetroEvent::Frame => {
                            /*
                            let mut same = true;
                            {
                                let mut frame_locked = frame.lock().expect(
                                    "Couldn't lock frame on our end. Did the encoder thread panic?",
@ -229,6 +273,8 @@ async fn main() -> anyhow::Result<()> {
                                let mut surf = surface_clone.lock().expect(
                                "locking the VNC surface to paint it to the ffmpeg frame failed",
                            );
                                let surf_buf = surf.get_buffer();
                                let buf_ptr =
@ -247,12 +293,27 @@ async fn main() -> anyhow::Result<()> {
                                        )
                                    };
                                    // If any line differs then the frame isn't the same and should be encoded.
                                    if &surf_buf[line_stride..line_stride + width as usize]
                                        != dest_line_slice
                                    {
                                        same = false;
                                    }
                                    dest_line_slice.copy_from_slice(
                                        &surf_buf[line_stride..line_stride + width as usize],
                                    );
                                }
                            }
                            if !same {
                                let _ = state_clone
                                    .encoder_tx
                                    .blocking_lock()
                                    .blocking_send(encoder_thread::EncodeThreadInput::SendFrame);
                            }
                            */
                            let _ = state_clone
                                .encoder_tx
                                .blocking_lock()
@ -261,6 +322,12 @@ async fn main() -> anyhow::Result<()> {
                        RetroEvent::Resize { size } => {
                            // make a new frame for the encoder
                            let _ = state_clone.encoder_tx.blocking_lock().blocking_send(
                                encoder_thread::EncodeThreadInput::Init { size: size.clone() },
                            );
                            /*
                            {
                                let mut lk_frame = frame_clone.lock().expect("Couldn't lock frame");
@ -270,30 +337,21 @@ async fn main() -> anyhow::Result<()> {
                                    size.clone().height,
                                ));
                            }
-
+                            */
                            let _ = state_clone.encoder_tx.blocking_lock().blocking_send(
                                encoder_thread::EncodeThreadInput::Init { size: size.clone() },
                            );
                        }
                        RetroEvent::WantInputs { tx } => {
                            let inputs = state_clone.inputs.blocking_lock();
                            //tracing::info!("giving inputs {:?}", inputs);
                            tx.send(inputs.clone()).expect("FUCK");
                        }
                    },
-                    Err(TryRecvError::Disconnected) => break,
+                    None => break,
                    Err(TryRecvError::Empty) => {}
                }
                match encoder_rx.try_recv() {
                    Ok(msg) => match msg {
                        encoder_thread::EncodeThreadOutput::Frame { packet } => {
                            // let _ = state_clone
                            //     .websocket_broadcast_tx
                            //     .send(WsMessage::VideoPacket { packet });
                            // :(
                            let packet_data = {
                                let slice = packet.data().expect(
@ -301,7 +359,9 @@ async fn main() -> anyhow::Result<()> {
                                );
                                slice.to_vec()
                            };
-                            let _ = transport_clone.broadcast_message(transport::TransportMessage::Binary(packet_data));
+                            let _ = state_clone.transport.broadcast_message(
                                transport::TransportMessage::Binary(packet_data),
                            );
                        }
                    },
                    Err(TryRecvError::Empty) => {}
--- a/server/src/retro_thread.rs
+++ b/server/src/retro_thread.rs
@ -17,7 +17,7 @@ use retro_frontend::{
 use gpu::egl_helpers::DeviceContext;
 use letsplay_gpu as gpu;
-use crate::{surface::Surface, types::Size};
+use crate::{surface::Surface, types::Size, video::cuda_gl::safe::GraphicsResource};
 /// Called by OpenGL. We use this to dump errors.
 extern "system" fn opengl_message_callback(
@ -49,30 +49,38 @@ pub struct RetroState {
    pad: RetroPad,
    // EGL state
-    egl_context: Option<DeviceContext>,
+    egl_context: Arc<Mutex<DeviceContext>>,
    /// Locked framebuffer.
-    framebuffer: Arc<Mutex<Surface>>,
+    software_framebuffer: Surface,
    /// OpenGL FBO
    gl_framebuffer: gpu::GlFramebuffer,
-    /// Cached readback buffer.
+    gl_rendering: bool,
-    readback_buffer: Surface,
+
    cuda_resource: Arc<Mutex<GraphicsResource>>,
    event_tx: mpsc::Sender<RetroEvent>,
 }
 impl RetroState {
-    pub fn new(framebuffer: Arc<Mutex<Surface>>, event_tx: mpsc::Sender<RetroEvent>) -> Box<Self> {
+    pub fn new(
        device_context: Arc<Mutex<DeviceContext>>,
        resource: Arc<Mutex<GraphicsResource>>,
        event_tx: mpsc::Sender<RetroEvent>,
    ) -> Box<Self> {
        let mut boxed = Box::new(Self {
            frontend: None,
            pad: RetroPad::new(),
-            egl_context: None,
+            egl_context: device_context.clone(),
-            framebuffer,
+            software_framebuffer: Surface::new(),
            gl_framebuffer: gpu::GlFramebuffer::new(),
-            readback_buffer: Surface::new(),
+            gl_rendering: false,
            cuda_resource: resource.clone(),
            event_tx,
        });
@ -127,14 +135,29 @@ impl RetroState {
    /// Initalizes the headless EGL context used for OpenGL rendering.
    fn hw_gl_egl_init(&mut self) {
-        self.egl_context = Some(DeviceContext::new(0));
+        self.egl_context.lock().expect("piss").make_current();
        unsafe {
            // Load OpenGL functions using the EGL loader.
            gl::load_with(|s| {
                let str = std::ffi::CString::new(s).expect("gl::load_with fail");
                std::mem::transmute(gpu::egl::GetProcAddress(str.as_ptr()))
            });
            // set OpenGL debug message callback
            gl::Enable(gl::DEBUG_OUTPUT);
            gl::DebugMessageCallback(Some(opengl_message_callback), std::ptr::null());
        }
    }
    /// Destroys OpenGL resources and the EGL context.
    fn hw_gl_destroy(&mut self) {
-        if self.egl_context.is_some() {
+        self.gl_framebuffer.destroy();
-            self.gl_framebuffer.destroy();
+
-            self.egl_context.take().unwrap().destroy()
+        {
            let mut locked = self.egl_context.lock().expect("piss");
            locked.release();
            locked.destroy();
        }
    }
@ -143,59 +166,39 @@ impl RetroState {
        let step_ms = (1.0 / av_info.timing.fps) * 1000.;
        let step_duration = Duration::from_millis(step_ms as u64);
-        self.get_frontend().run_frame();
+        {
            let egl = (&mut self.egl_context).clone();
            let locked_egl = egl.lock().expect("fuck YOU");
            locked_egl.make_current();
            self.get_frontend().run_frame();
            locked_egl.release();
        }
        std::thread::sleep(step_duration);
    }
    /// Bleh, I don't like this is an associated fn, but whatever
-    fn update_impl(framebuffer: Arc<Mutex<Surface>>, slice: &[u32], pitch: u32, from_opengl: bool) {
+    fn update_software_framebuffer(framebuffer: &mut Surface, slice: &[u32], pitch: u32) {
-        let mut framebuffer_locked = framebuffer.lock().expect("could not lock framebuffer");
+        let size = framebuffer.size.clone();
-
+        let buffer = framebuffer.get_buffer();
        let size = framebuffer_locked.size.clone();
        let buffer = framebuffer_locked.get_buffer();
        let has_disconnected_pitch = pitch != size.width as u32;
-        // If this frame came from OpenGL we need to flip the image around
+        for y in 0..size.height {
-        // so it is right side up (from our perspective).
+            let src_line_off = (y as u32 * pitch) as usize;
-        //
+            let mut dest_line_off = (y as u32 * size.width) as usize;
        // We do this in a bit of a convoluted way (but it's also zero allocation!)
        if from_opengl {
            for y in (0..size.height).rev() {
                let src_line_off = (y as u32 * pitch) as usize;
                let src_slice = &slice[src_line_off..src_line_off + size.width as usize];
-                let reversed_y = (size.height - 1) - y;
+            // copy only
-
+            if has_disconnected_pitch {
-                let src_line_off = (reversed_y as u32 * pitch) as usize;
+                dest_line_off = (y * pitch) as usize;
                let mut dest_line_off = src_line_off;
                // copy only
                if has_disconnected_pitch {
                    dest_line_off = (reversed_y * pitch.min(size.width)) as usize;
                }
                let dest_slice = &mut buffer[dest_line_off..dest_line_off + size.width as usize];
                dest_slice.copy_from_slice(src_slice);
            }
        } else {
            for y in 0..size.height {
                let src_line_off = (y as u32 * pitch) as usize;
                let mut dest_line_off = src_line_off;
-                // copy only
+            // Create slices repressenting each part
-                if has_disconnected_pitch {
+            let src_slice = &slice[src_line_off..src_line_off + size.width as usize];
-                    dest_line_off = (y * size.width) as usize;
+            let dest_slice = &mut buffer[dest_line_off..dest_line_off + size.width as usize];
                }
-                // Create slices repressenting each part
+            dest_slice.copy_from_slice(src_slice);
                let src_slice = &slice[src_line_off..src_line_off + size.width as usize];
                let dest_slice = &mut framebuffer_locked.get_buffer()
                    [dest_line_off..dest_line_off + size.width as usize];
                dest_slice.copy_from_slice(src_slice);
            }
        }
    }
 }
@ -204,21 +207,32 @@ impl FrontendInterface for RetroState {
    fn video_resize(&mut self, width: u32, height: u32) {
        tracing::info!("Resized to {width}x{height}");
-        if self.egl_context.is_some() {
+        self.gl_framebuffer.resize(width, height);
-            self.gl_framebuffer.resize(width, height);
+        let raw = self.gl_framebuffer.as_raw();
            let raw = self.gl_framebuffer.as_raw();
-            // Notify the frontend layer about the new FBO ID
+        // Notify the frontend layer about the new FBO ID
-            self.get_frontend().set_gl_fbo(raw);
+        self.get_frontend().set_gl_fbo(raw);
-            // Resize the readback buffer
+        if !self.gl_rendering {
-            self.readback_buffer.resize(Size { width, height });
+            self.software_framebuffer.resize(Size { width, height });
        }
-        self.framebuffer
+        unsafe {
-            .lock()
+        }
-            .expect("its over?")
+
-            .resize(Size { width, height });
+        // map to cuda
        {
            let mut locked = self
                .cuda_resource
                .lock()
                .expect("YOU MOTHERFUCKER PISS GO COUNT YOUR DICK");
            locked.device().bind_to_thread().expect("fuck");
            locked
                .register(self.gl_framebuffer.texture_id(), gl::TEXTURE_2D)
                .expect("you fucking asswater");
        }
        let _ = self.event_tx.blocking_send(RetroEvent::Resize {
            size: Size { width, height },
@ -226,39 +240,29 @@ impl FrontendInterface for RetroState {
    }
    fn video_update(&mut self, slice: &[u32], pitch: u32) {
-        Self::update_impl(self.framebuffer.clone(), slice, pitch, false);
+        Self::update_software_framebuffer(&mut self.software_framebuffer, slice, pitch);
        let size = self.software_framebuffer.size.clone();
        // upload texture to GPU
        unsafe {
            gl::TexImage2D(
                gl::TEXTURE_2D,
                0,
                gl::RGBA as i32,
                size.width as i32,
                size.height as i32,
                0,
                gl::RGBA_INTEGER,
                gl::UNSIGNED_INT_8_8_8_8,
                self.software_framebuffer.get_buffer().as_mut_ptr() as *const _,
            );
        }
        let _ = self.event_tx.blocking_send(RetroEvent::Frame);
    }
    fn video_update_gl(&mut self) {
        let dimensions = self.get_frontend().get_size();
        // Read back the framebuffer
        let slice = {
            // lame, doesn't do bgra conversion for us
            //self.gl_framebuffer.read_pixels(
            //    &mut self.readback_buffer.get_buffer()[..],
            //    dimensions.0,
            //    dimensions.1,
            //);
            unsafe {
                let _bind = self.gl_framebuffer.bind();
                gl::ReadPixels(
                    0,
                    0,
                    dimensions.0 as i32,
                    dimensions.1 as i32,
                    gl::BGRA,
                    gl::UNSIGNED_BYTE,
                    (&mut self.readback_buffer.get_buffer()).as_mut_ptr() as *mut std::ffi::c_void,
                );
            }
            self.readback_buffer.get_buffer()
        };
        Self::update_impl(self.framebuffer.clone(), slice, dimensions.0, true);
        let _ = self.event_tx.blocking_send(RetroEvent::Frame);
    }
@ -270,9 +274,13 @@ impl FrontendInterface for RetroState {
        let (tx, rx) = oneshot::channel();
        let _ = self.event_tx.blocking_send(RetroEvent::WantInputs { tx });
-        let inputs = rx.blocking_recv().expect("what the FUCK are you doing");
+        let inputs = rx.blocking_recv();
-        for key in &inputs {
+        if inputs.is_err() {
            return;
        }
        for key in &inputs.unwrap() {
            use xkeysym::key as Key;
            match *key {
@ -349,38 +357,23 @@ impl FrontendInterface for RetroState {
    fn hw_gl_init(&mut self) -> Option<HwGlInitData> {
        // Only create a new EGL/OpenGL context if we have to.
-        if self.egl_context.is_none() {
+        let context = self.egl_context.lock().expect("fuck you!");
-            // Initalize EGL
+        let extensions = gpu::egl_helpers::get_extensions(context.get_display());
            self.hw_gl_egl_init();
-            let context = self.egl_context.as_ref().unwrap();
+        tracing::debug!("Supported EGL extensions: {:?}", extensions);
            let extensions = gpu::egl_helpers::get_extensions(context.get_display());
-            tracing::debug!("Supported EGL extensions: {:?}", extensions);
+        // Check for EGL_KHR_get_all_proc_addresses, so we can use eglGetProcAddress() to load OpenGL functions
-
+        if !extensions.contains(&"EGL_KHR_get_all_proc_addresses".into()) {
-            // Check for EGL_KHR_get_all_proc_addresses, so we can use eglGetProcAddress() to load OpenGL functions
+            tracing::error!("Your graphics driver doesn't support the EGL_KHR_get_all_proc_addresses extension.");
-            if !extensions.contains(&"EGL_KHR_get_all_proc_addresses".into()) {
+            tracing::error!("Retrodemo currently needs this to load OpenGL functions. HW rendering will be disabled.");
-                tracing::error!("Your graphics driver doesn't support the EGL_KHR_get_all_proc_addresses extension.");
+            return None;
                tracing::error!("Retrodemo currently needs this to load OpenGL functions. HW rendering will be disabled.");
                return None;
            }
            unsafe {
                // Load OpenGL functions using the EGL loader.
                gl::load_with(|s| {
                    let str = std::ffi::CString::new(s).expect("gl::load_with fail");
                    std::mem::transmute(gpu::egl::GetProcAddress(str.as_ptr()))
                });
                // set OpenGL debug message callback
                gl::Enable(gl::DEBUG_OUTPUT);
                gl::DebugMessageCallback(Some(opengl_message_callback), std::ptr::null());
            }
        }
-        // Create the initial FBO for the core to render to
+        // If we get here, we can be certain that we're no longer
-        let dimensions = self.get_frontend().get_size();
+        // going to use software rendering. Therefore, we can
-        self.gl_framebuffer.resize(dimensions.0, dimensions.1);
+        // clear (free) the software framebuffer, since it won't be of use to us anymore.
        self.software_framebuffer.clear();
        self.gl_rendering = true;
        return Some(HwGlInitData {
            get_proc_address: gpu::egl::GetProcAddress as *mut std::ffi::c_void,
@ -403,25 +396,34 @@ pub enum RetroEvent {
 pub enum RetroInEvent {
    Start,
    LoadCore(std::path::PathBuf),
    LoadGame(std::path::PathBuf),
 }
 fn retro_thread_main(
-    surface: Arc<Mutex<Surface>>,
+    context: &Arc<Mutex<DeviceContext>>,
    resource: &Arc<Mutex<GraphicsResource>>,
    event_tx: mpsc::Sender<RetroEvent>,
    mut event_rx: mpsc::Receiver<RetroInEvent>,
 ) {
-    let mut app = RetroState::new(surface, event_tx);
+    let mut app = RetroState::new(context.clone(), resource.clone(), event_tx);
-    app.load_core("cores/swanstation_libretro.so")
+    // do EGL init first
-        .expect("failed to load core");
+    app.hw_gl_egl_init();
    app.load_game("roms/merged/nmv3/us/nmv3_us.cue") //merged/nmv1/us/nmv1_us.cue
        .expect("failed to load game");
-    // sync
+    // pre-setup
    loop {
        match event_rx.blocking_recv() {
            None => return (),
            Some(msg) => match msg {
                RetroInEvent::LoadCore(path) => {
                    app.load_core(path).expect("Failed to load core.");
                }
                RetroInEvent::LoadGame(path) => {
                    app.load_game(path).expect("Failed to load game!");
                }
                RetroInEvent::Start => break,
            },
        }
@ -435,18 +437,23 @@ fn retro_thread_main(
 }
 pub fn spawn_retro_thread(
-    surface: Arc<Mutex<Surface>>,
+    context: Arc<Mutex<DeviceContext>>,
    resource: Arc<Mutex<GraphicsResource>>,
 ) -> (mpsc::Receiver<RetroEvent>, mpsc::Sender<RetroInEvent>) {
-    let (event_tx, event_rx) = mpsc::channel(8);
+    // essentially semaphores
-    let (event_in_tx, event_in_rx) = mpsc::channel(8);
+    let (event_tx, event_rx) = mpsc::channel(1);
-    let fb_clone = surface.clone();
+    let (event_in_tx, event_in_rx) = mpsc::channel(1);
    let cloned = resource.clone();
    let ctxcloned = context.clone();
    // discard the join handle
    let _ = std::thread::Builder::new()
        .name("retro_game".into())
        .spawn(move || {
-            retro_thread_main(fb_clone, event_tx, event_in_rx);
+            retro_thread_main(&ctxcloned, &cloned, event_tx, event_in_rx);
-        }).expect("failed to spawn the game thread");
+        })
        .expect("failed to spawn the game thread");
    (event_rx, event_in_tx)
 }
--- a/server/src/surface.rs
+++ b/server/src/surface.rs
@ -21,7 +21,7 @@ pub fn alloc_boxed_slice<T: Sized>(len: usize) -> Box<[T]> {
    unsafe { Box::from_raw(slice) }
 }
-/// A BGRA-format surface.
+/// A BGRA (or RGBA, I'm not your dad. There are funtions that assume the former though) format surface.
 pub struct Surface {
    buffer: Option<Box<[u32]>>,
    pub size: Size,
@ -38,10 +38,18 @@ impl Surface {
        }
    }
    pub fn clear(&mut self) {
        self.buffer = None;
        self.size = Size {
            width: 0,
            height: 0,
        };
    }
    pub fn resize(&mut self, size: Size) {
        self.size = size;
-		self.buffer = Some(alloc_boxed_slice(self.size.linear()));
+        self.buffer = Some(alloc_boxed_slice(self.size.linear()));
    }
    pub fn get_buffer(&mut self) -> &mut [u32] {
@ -53,15 +61,14 @@ impl Surface {
    pub fn blit_buffer(&mut self, src_at: Rect, data: &[u32]) {
        let mut off = 0;
-		let buf = self.buffer.as_mut().unwrap();
+        let buf = self.buffer.as_mut().unwrap();
-		let buf_slice = &mut *buf;
+        let buf_slice = &mut *buf;
        for y in src_at.y..src_at.y + src_at.height {
            let src = &data[off..off + src_at.width as usize];
            let dest_start_offset = (y as usize * self.size.width as usize) + src_at.x as usize;
-            let dest =
+            let dest = &mut buf_slice[dest_start_offset..dest_start_offset + src_at.width as usize];
                &mut buf_slice[dest_start_offset..dest_start_offset + src_at.width as usize];
            // This forces alpha to always be 0xff. I *could* probably do this in a clearer way though :(
            for (dest, src_item) in dest.iter_mut().zip(src.iter()) {
--- a/server/src/video/cuda_gl/bindgen.sh
+++ b/server/src/video/cuda_gl/bindgen.sh
@ -0,0 +1,27 @@
 #!/bin/bash
 # Does bindgen for CUDA cudaGL. (needs postprocessing)
 set -exu
 #  --allowlist-type="^CU.*" \
 #   --allowlist-type="^cuuint(32|64)_t" \
 #   --allowlist-type="^cudaError_enum" \
 #   --allowlist-type="^cu.*Complex$" \
 #   --allowlist-type="^cuda.*" \
 #   --allowlist-type="^libraryPropertyType.*" \
 #   --allowlist-var="^CU.*" \
 echo "use cudarc::sys::*; /* Hack :3 */" > ./sys.rs
 bindgen \
  --allowlist-type="" \
  --allowlist-function="^cuGraphicsGL.*" \
  --default-enum-style=rust \
  --no-doc-comments \
  --with-derive-default \
  --with-derive-eq \
  --with-derive-hash \
  --with-derive-ord \
  --use-core \
  --dynamic-loading Lib \
  gl.h -- -I/opt/cuda/include \
  >> ./sys.rs
--- a/server/src/video/cuda_gl/gl.h
+++ b/server/src/video/cuda_gl/gl.h
@ -0,0 +1 @@
 #include "cudaGL.h"
--- a/server/src/video/cuda_gl/mod.rs
+++ b/server/src/video/cuda_gl/mod.rs
@ -0,0 +1,14 @@
 pub mod sys;
 use sys::*;
 pub mod safe;
 pub unsafe fn lib() -> &'static Lib {
    static LIB: std::sync::OnceLock<Lib> = std::sync::OnceLock::new();
    LIB.get_or_init(|| {
        if let Ok(lib) = Lib::new(libloading::library_filename("cuda")) {
            return lib;
        }
        panic!("cuda library doesn't exist.");
    })
 }
--- a/server/src/video/cuda_gl/safe.rs
+++ b/server/src/video/cuda_gl/safe.rs
@ -0,0 +1,135 @@
 use cudarc::driver::{result as cuda_result, safe as cuda_safe, sys as cuda_sys, CudaDevice};
 use super::sys;
 use std::sync::Arc;
 pub struct MappedGraphicsResource {
    resource: cuda_sys::CUgraphicsResource,
 }
 impl MappedGraphicsResource {
    fn new(resource: cuda_sys::CUgraphicsResource) -> Self {
        Self { resource }
    }
    pub fn map(&mut self) -> Result<(), cuda_result::DriverError> {
        unsafe {
            cuda_sys::lib()
                .cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut())
                .result()?;
        }
        Ok(())
    }
    pub fn unmap(&mut self) -> Result<(), cuda_result::DriverError> {
        unsafe {
            cuda_sys::lib()
                .cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut())
                .result()?;
        }
        Ok(())
    }
    pub fn get_mapped_array(&mut self) -> Result<cuda_sys::CUarray, cuda_result::DriverError> {
        assert!(
            !self.resource.is_null(),
            "do not call GraphicsResource::get_mapped_array if no resource is actually registered"
        );
        let mut array: cuda_sys::CUarray = std::ptr::null_mut();
        unsafe {
            cuda_sys::lib()
                .cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0)
                .result()?;
        }
        Ok(array)
    }
 }
 impl Drop for MappedGraphicsResource {
    fn drop(&mut self) {
        let _ = self.unmap();
    }
 }
 /// Wrapper over cuGraphicsGL* apis
 pub struct GraphicsResource {
    context: Arc<CudaDevice>,
    resource: cuda_sys::CUgraphicsResource,
 }
 impl GraphicsResource {
    pub fn new(device: &Arc<CudaDevice>) -> Self {
        Self {
            context: device.clone(),
            resource: std::ptr::null_mut(),
        }
    }
    pub fn device(&self) -> Arc<CudaDevice> {
        self.context.clone()
    }
    /// Maps this resource.
    pub fn map(&mut self) -> Result<MappedGraphicsResource, cuda_result::DriverError> {
        let mut res = MappedGraphicsResource::new(self.resource);
        res.map()?;
        Ok(res)
    }
    pub fn register(
        &mut self,
        texture_id: gl::types::GLuint,
        texture_kind: gl::types::GLuint,
    ) -> Result<(), cuda_result::DriverError> {
        // better to be safe than leak memory? idk.
        if !self.resource.is_null() {
            self.unregister()?;
        }
        unsafe {
            super::lib()
                .cuGraphicsGLRegisterImage(&mut self.resource, texture_id, texture_kind, 1)
                .result()?;
        }
        Ok(())
    }
    pub fn is_registered(&self) -> bool {
        !self.resource.is_null()
    }
    pub fn unregister(&mut self) -> Result<(), cuda_result::DriverError> {
        assert!(
            !self.resource.is_null(),
            "do not call if no resource is actually registered"
        );
        unsafe {
            cuda_sys::lib()
                .cuGraphicsUnregisterResource(self.resource)
                .result()?;
        }
        self.resource = std::ptr::null_mut();
        Ok(())
    }
 }
 impl Drop for GraphicsResource {
    fn drop(&mut self) {
        if self.is_registered() {
            let _ = self.unregister();
        }
    }
 }
 unsafe impl Send for GraphicsResource {}
--- a/server/src/video/cuda_gl/sys.rs
+++ b/server/src/video/cuda_gl/sys.rs
@ -0,0 +1,73 @@
 use cudarc::driver::sys::*; /* Hack :3 */
 use gl::types::{GLenum, GLuint};
 /* automatically generated by rust-bindgen 0.69.4 */
 pub struct Lib {
    __library: ::libloading::Library,
    pub cuGraphicsGLRegisterBuffer: Result<
        unsafe extern "C" fn(
            pCudaResource: *mut CUgraphicsResource,
            buffer: GLuint,
            Flags: ::core::ffi::c_uint,
        ) -> CUresult,
        ::libloading::Error,
    >,
    pub cuGraphicsGLRegisterImage: Result<
        unsafe extern "C" fn(
            pCudaResource: *mut CUgraphicsResource,
            image: GLuint,
            target: GLenum,
            Flags: ::core::ffi::c_uint,
        ) -> CUresult,
        ::libloading::Error,
    >,
 }
 impl Lib {
    pub unsafe fn new<P>(path: P) -> Result<Self, ::libloading::Error>
    where
        P: AsRef<::std::ffi::OsStr>,
    {
        let library = ::libloading::Library::new(path)?;
        Self::from_library(library)
    }
    pub unsafe fn from_library<L>(library: L) -> Result<Self, ::libloading::Error>
    where
        L: Into<::libloading::Library>,
    {
        let __library = library.into();
        let cuGraphicsGLRegisterBuffer = __library
            .get(b"cuGraphicsGLRegisterBuffer\0")
            .map(|sym| *sym);
        let cuGraphicsGLRegisterImage = __library
            .get(b"cuGraphicsGLRegisterImage\0")
            .map(|sym| *sym);
        Ok(Lib {
            __library,
            cuGraphicsGLRegisterBuffer,
            cuGraphicsGLRegisterImage,
        })
    }
    pub unsafe fn cuGraphicsGLRegisterBuffer(
        &self,
        pCudaResource: *mut CUgraphicsResource,
        buffer: GLuint,
        Flags: ::core::ffi::c_uint,
    ) -> CUresult {
        (self
            .cuGraphicsGLRegisterBuffer
            .as_ref()
            .expect("Expected function, got error."))(pCudaResource, buffer, Flags)
    }
    pub unsafe fn cuGraphicsGLRegisterImage(
        &self,
        pCudaResource: *mut CUgraphicsResource,
        image: GLuint,
        target: GLenum,
        Flags: ::core::ffi::c_uint,
    ) -> CUresult {
        (self
            .cuGraphicsGLRegisterImage
            .as_ref()
            .expect("Expected function, got error."))(pCudaResource, image, target, Flags)
    }
 }
--- a/server/src/video/encoder_thread.rs
+++ b/server/src/video/encoder_thread.rs
@ -1,11 +1,16 @@
 use cudarc::driver::{
    sys::{CUdeviceptr, CUmemorytype},
    CudaDevice, LaunchAsync,
 };
 use letsplay_gpu::egl_helpers::DeviceContext;
 use std::{
    sync::{Arc, Mutex},
    time::Duration,
 };
 use tokio::sync::mpsc::{self, error::TryRecvError};
 use super::ffmpeg;
 use super::h264_encoder::H264Encoder;
 use super::{cuda_gl::safe::GraphicsResource, ffmpeg};
 pub enum EncodeThreadInput {
    Init { size: crate::types::Size },
@ -37,7 +42,7 @@ impl EncoderState {
        self.encoder = Some(H264Encoder::new_nvenc_swframe(
            size.clone(),
            60,
-            3 * (1024 * 1024),
+            2 * (1024 * 1024),
        )?);
        // replace packet
@ -85,7 +90,76 @@ impl EncoderState {
    }
 }
-fn encoder_thread_main(
+struct EncoderStateHW {
    encoder: Option<H264Encoder>,
    frame: ffmpeg::frame::Video,
    packet: ffmpeg::Packet,
 }
 impl EncoderStateHW {
    fn new() -> Self {
        Self {
            encoder: None,
            frame: ffmpeg::frame::Video::empty(),
            packet: ffmpeg::Packet::empty(),
        }
    }
    fn init(&mut self, device: &Arc<CudaDevice>, size: crate::types::Size) -> anyhow::Result<()> {
        self.encoder = Some(H264Encoder::new_nvenc_hwframe(
            &device,
            size.clone(),
            60,
            2 * (1024 * 1024),
        )?);
        // replace packet
        self.packet = ffmpeg::Packet::empty();
        self.frame = self.encoder.as_mut().unwrap().create_frame()?;
        Ok(())
    }
    #[inline]
    fn frame(&mut self) -> &mut ffmpeg::frame::Video {
        &mut self.frame
    }
    fn send_frame(&mut self, pts: u64, force_keyframe: bool) -> Option<ffmpeg::Packet> {
        let frame = &mut self.frame;
        let encoder = self.encoder.as_mut().unwrap();
        // set frame metadata
        unsafe {
            if force_keyframe {
                (*frame.as_mut_ptr()).pict_type = ffmpeg::sys::AVPictureType::AV_PICTURE_TYPE_I;
                (*frame.as_mut_ptr()).flags = ffmpeg::sys::AV_FRAME_FLAG_KEY;
                (*frame.as_mut_ptr()).key_frame = 1;
            } else {
                (*frame.as_mut_ptr()).pict_type = ffmpeg::sys::AVPictureType::AV_PICTURE_TYPE_NONE;
                (*frame.as_mut_ptr()).flags = 0i32;
                (*frame.as_mut_ptr()).key_frame = 0;
            }
            (*frame.as_mut_ptr()).pts = pts as i64;
        }
        encoder.send_frame(&*frame);
        encoder
            .receive_packet(&mut self.packet)
            .expect("Failed to recieve packet");
        unsafe {
            if !self.packet.is_empty() {
                return Some(self.packet.clone());
            }
        }
        return None;
    }
 }
 fn encoder_thread_swframe_main(
    mut rx: mpsc::Receiver<EncodeThreadInput>,
    tx: mpsc::Sender<EncodeThreadOutput>,
    frame: &Arc<Mutex<Option<ffmpeg::frame::Video>>>,
@ -143,7 +217,7 @@ fn encoder_thread_main(
    Ok(())
 }
-pub fn encoder_thread_spawn(
+pub fn encoder_thread_spawn_swframe(
    frame: &Arc<Mutex<Option<ffmpeg::frame::Video>>>,
 ) -> (
    mpsc::Receiver<EncodeThreadOutput>,
@ -154,7 +228,192 @@ pub fn encoder_thread_spawn(
    let clone = Arc::clone(frame);
-    std::thread::spawn(move || encoder_thread_main(in_rx, out_tx, &clone));
+    std::thread::spawn(move || encoder_thread_swframe_main(in_rx, out_tx, &clone));
    (out_rx, in_tx)
 }
 fn encoder_thread_hwframe_main(
    mut rx: mpsc::Receiver<EncodeThreadInput>,
    tx: mpsc::Sender<EncodeThreadOutput>,
    cuda_device: &Arc<CudaDevice>,
    cuda_resource: &Arc<Mutex<GraphicsResource>>,
    gl_context: &Arc<Mutex<DeviceContext>>,
 ) -> anyhow::Result<()> {
    let mut frame_number = 0u64;
    let mut force_keyframe = false;
    let mut encoder = EncoderStateHW::new();
    // :)
    cuda_device.bind_to_thread()?;
    /*
    let mut memcpy = cudarc::driver::sys::CUDA_MEMCPY2D_st::default();
    // setup basic src stuff
    memcpy.srcXInBytes = 0;
    memcpy.srcY = 0;
    memcpy.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY;
    // dest
    memcpy.dstXInBytes = 0;
    memcpy.dstY = 0;
    memcpy.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
     */
    loop {
        match rx.try_recv() {
            Ok(msg) => match msg {
                EncodeThreadInput::Init { size } => {
                    frame_number = 0;
                    if force_keyframe {
                        force_keyframe = false;
                    }
                    encoder
                        .init(cuda_device, size)
                        .expect("encoder init failed");
                }
                EncodeThreadInput::ForceKeyframe => {
                    force_keyframe = true;
                }
                EncodeThreadInput::SendFrame => {
                    // copy gl frame *ON THE GPU*
                    {
                        let mut gl_ctx = gl_context.lock().expect("you dumb fuck");
                        let mut gl_resource =
                            cuda_resource.lock().expect("couldnt lock GL resource!");
                        gl_ctx.make_current();
                        let mut mapped = gl_resource
                            .map()
                            .expect("couldnt map graphics resource. Its joever");
                        let array = mapped
                            .get_mapped_array()
                            .expect("well its all over anyways");
                        {
                            let frame = encoder.frame();
                            let width = frame.width();
                            let height = frame.height();
                            let launch_config = cudarc::driver::LaunchConfig {
                                grid_dim: (width / 16 + 1, height / 2 + 1, 0),
                                block_dim: (16, 2, 0),
                                shared_mem_bytes: 0,
                            };
                            let flip_opengl = cuda_device
                                .get_func("module", "flip_opengl")
                                .expect("dumb fucker");
                            unsafe {
                                let frame_ptr = frame.as_mut_ptr();
                                let mut params = [
                                    array as *mut std::ffi::c_void,
                                    (*frame_ptr).data[0] as CUdeviceptr as *mut std::ffi::c_void,
                                    width as *const u32 as *mut std::ffi::c_void,
                                    height as *const u32 as *mut std::ffi::c_void,
                                ];
                                flip_opengl
                                    .launch(launch_config, &mut params[..])
                                    .expect("oh its done");
                            cudarc::driver::sys::lib()
                            .cuStreamSynchronize(std::ptr::null_mut())
                            .result()
                            .expect("you banned");
                            }
                        }
                        /*
                        // setup the cuMemcpy2D
                        {
                            let frame = encoder.frame();
                            memcpy.srcArray = array;
                            unsafe {
                                let frame_ptr = frame.as_mut_ptr();
                                memcpy.dstDevice = (*frame_ptr).data[0] as CUdeviceptr;
                                memcpy.dstPitch = (*frame_ptr).linesize[0] as usize;
                                memcpy.WidthInBytes = ((*frame_ptr).width * 4) as usize;
                                memcpy.Height = (*frame_ptr).height as usize;
                            }
                        }
                        unsafe {
                            cudarc::driver::sys::lib()
                                .cuMemcpy2DAsync_v2(&memcpy, std::ptr::null_mut())
                                .result()
                                .expect("cuMemcpy2D fail epic");
                            cudarc::driver::sys::lib()
                                .cuStreamSynchronize(std::ptr::null_mut())
                                .result()
                                .expect("you banned");
                        }                    */
                        mapped.unmap().expect("fuck you asshole");
                        gl_ctx.release();
                    }
                    if let Some(pkt) = encoder.send_frame(frame_number as u64, force_keyframe) {
                        // A bit less clear than ::empty(), but it's "Safe"
                        if let Some(_) = pkt.data() {
                            let _ = tx.blocking_send(EncodeThreadOutput::Frame {
                                packet: pkt.clone(),
                            });
                        }
                        frame_number += 1;
                    }
                    if force_keyframe {
                        force_keyframe = false;
                    }
                }
            },
            Err(TryRecvError::Disconnected) => break,
            Err(TryRecvError::Empty) => {
                std::thread::sleep(Duration::from_millis(1));
            }
        }
    }
    Ok(())
 }
 pub fn encoder_thread_spawn_hwframe(
    cuda_device: &Arc<CudaDevice>,
    cuda_resource: &Arc<Mutex<GraphicsResource>>,
    gl_context: &Arc<Mutex<DeviceContext>>,
 ) -> (
    mpsc::Receiver<EncodeThreadOutput>,
    mpsc::Sender<EncodeThreadInput>,
 ) {
    let (in_tx, in_rx) = mpsc::channel(1);
    let (out_tx, out_rx) = mpsc::channel(1);
    let dev_clone = Arc::clone(cuda_device);
    let rsrc_clone = Arc::clone(cuda_resource);
    let gl_clone = Arc::clone(gl_context);
    std::thread::spawn(move || {
        encoder_thread_hwframe_main(in_rx, out_tx, &dev_clone, &rsrc_clone, &gl_clone)
    });
    (out_rx, in_tx)
 }
--- a/server/src/video/h264_encoder.rs
+++ b/server/src/video/h264_encoder.rs
@ -135,8 +135,8 @@ impl H264Encoder {
        video_encoder_context.set_format(ffmpeg::format::Pixel::ZRGB32);
-        video_encoder_context.set_qmin(38);
+        video_encoder_context.set_qmin(37);
-        video_encoder_context.set_qmax(32);
+        video_encoder_context.set_qmax(33);
        // set h264_nvenc options
        let mut dict = ffmpeg::Dictionary::new();
@ -148,7 +148,7 @@ impl H264Encoder {
        // TODO:
        dict.set("rc", "vbr");
-        dict.set("qp", "45");
+        dict.set("qp", "35");
        dict.set("forced-idr", "1");
@ -169,9 +169,6 @@ impl H264Encoder {
        max_framerate: u32,
        bitrate: usize,
    ) -> anyhow::Result<Self> {
        /*
        (See FIXMEs above)
        let cuda_device_context = super::hwdevice::CudaDeviceContextBuilder::new()?
            .set_cuda_context((*cuda_device.cu_primary_ctx()) as *mut _)
            .build()
@ -180,14 +177,53 @@ impl H264Encoder {
        let mut hw_frame_context = super::hwframe::HwFrameContextBuilder::new(cuda_device_context)?
            .set_width(size.width)
            .set_height(size.height)
-            .set_sw_format(ffmpeg::format::Pixel::ZRGB32)
+            .set_sw_format(ffmpeg::format::Pixel::ZBGR32)
            .set_format(ffmpeg::format::Pixel::CUDA)
            .build()
            .with_context(|| "while trying to create CUDA frame context")?;
-         */
+        let (mut encoder, mut video_encoder_context) =
            create_context_and_set_common_parameters("h264_nvenc", &size, max_framerate, bitrate)
                .with_context(|| "while trying to create encoder")?;
-        todo!("Implement me!");
+        video_encoder_context.set_format(ffmpeg::format::Pixel::CUDA);
        video_encoder_context.set_qmin(37);
        video_encoder_context.set_qmax(33);
        unsafe {
            // FIXME: this currently breaks the avbufferref system a bit
            (*video_encoder_context.as_mut_ptr()).hw_frames_ctx = hw_frame_context.as_raw_mut();
            (*video_encoder_context.as_mut_ptr()).hw_device_ctx =
                hw_frame_context.as_device_context_mut();
        }
        // set h264_nvenc options
        let mut dict = ffmpeg::Dictionary::new();
        dict.set("tune", "ull");
        dict.set("preset", "p1");
        dict.set("profile", "main");
        // TODO:
        dict.set("rc", "vbr");
        dict.set("qp", "35");
        dict.set("forced-idr", "1");
        // damn you
        dict.set("delay", "0");
        dict.set("zerolatency", "1");
        let encoder = video_encoder_context
            .open_as_with(encoder, dict)
            .with_context(|| "While opening h264_nvenc video codec")?;
        Ok(Self::NvencHWFrame {
            encoder: encoder,
            hw_context: hw_frame_context,
        })
    }
    // NOTE: It's a bit pointless to have this have a mut borrow,
@ -207,46 +243,44 @@ impl H264Encoder {
    //    }
    //}
-    pub fn create_frame(&mut self) -> ffmpeg::frame::Video {
+    pub fn create_frame(&mut self) -> anyhow::Result<ffmpeg::frame::Video> {
        match self {
            Self::Software { encoder } | Self::NvencSWFrame { encoder } => {
-                return ffmpeg::frame::Video::new(encoder.format(), encoder.width(), encoder.height());
+                return Ok(ffmpeg::frame::Video::new(
                    encoder.format(),
                    encoder.width(),
                    encoder.height(),
                ));
            }
            Self::NvencHWFrame {
                encoder,
                hw_context,
            } => {
-                todo!("Implement H264Encoder::create_frame() for NvencHWFrame!");
+                let mut frame = ffmpeg::frame::Video::empty();
                unsafe {
                    (*frame.as_mut_ptr()).format = ffmpeg::format::Pixel::CUDA as i32;
                    (*frame.as_mut_ptr()).width = encoder.width() as i32;
                    (*frame.as_mut_ptr()).height = encoder.height() as i32;
                    (*frame.as_mut_ptr()).hw_frames_ctx = hw_context.as_raw_mut();
                    //ffmpeg::sys::av_frame_get_buffer(frame.as_mut_ptr(), 32);
                    hw_context.get_buffer(&mut frame)?;
                    hw_context.get_buffer(&mut frame)?;
                    (*frame.as_mut_ptr()).linesize[0] = (*frame.as_ptr()).width * 4;
                    return Ok(frame);
                }
            }
        }
        /*
         unsafe {
            let mut frame = ffmpeg::Frame::empty();
            (*frame.as_mut_ptr()).format = pixel_format as i32;
            (*frame.as_mut_ptr()).width = width as i32;
            (*frame.as_mut_ptr()).height = height as i32;
            (*frame.as_mut_ptr()).hw_frames_ctx = context.as_raw_mut();
            super::check_ret(ffmpeg::sys::av_hwframe_get_buffer(
                context.as_raw_mut(),
                frame.as_mut_ptr(),
                0,
            ))?;
            super::check_ret(ffmpeg::sys::av_hwframe_get_buffer(
                context.as_raw_mut(),
                frame.as_mut_ptr(),
                0,
            ))?;
-            (*frame.as_mut_ptr()).linesize[0] = (*frame.as_ptr()).width * 4;
+        */
            Ok(frame)
        }
         */
        todo!("FIXME");
    }
@ -267,7 +301,8 @@ impl H264Encoder {
                encoder,
                hw_context,
            } => {
-                todo!("Implement send_frame() for NvencHWFrame");
+                //todo!("Implement send_frame() for NvencHWFrame");
                encoder.send_frame(frame).unwrap();
            }
        }
    }
--- a/server/src/video/hwframe.rs
+++ b/server/src/video/hwframe.rs
@ -7,82 +7,103 @@ use ffmpeg::format::Pixel;
 use super::{check_ret, hwdevice::CudaDeviceContext};
 pub struct HwFrameContext {
-	_cuda_device_context: CudaDeviceContext,
+    _cuda_device_context: CudaDeviceContext,
-	buffer: *mut ffmpeg::sys::AVBufferRef,
+    buffer: *mut ffmpeg::sys::AVBufferRef,
 }
 impl HwFrameContext {
-	fn new(cuda_device_context: CudaDeviceContext, buffer: *mut ffmpeg::sys::AVBufferRef) -> Self {
+    fn new(cuda_device_context: CudaDeviceContext, buffer: *mut ffmpeg::sys::AVBufferRef) -> Self {
-		Self { _cuda_device_context: cuda_device_context, buffer }
+        Self {
-	}
+            _cuda_device_context: cuda_device_context,
            buffer,
        }
    }
-	// pub fn as_context_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext {
+    // pub fn as_context_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext {
-	// 	unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) }
+    // 	unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) }
-	// }
+    // }
-	// pub fn as_context(&self) -> &ffmpeg::sys::AVHWFramesContext {
+    // pub fn as_context(&self) -> &ffmpeg::sys::AVHWFramesContext {
-	// 	unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) }
+    // 	unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) }
-	// }
+    // }
-	pub fn as_raw_mut(&mut self) -> &mut ffmpeg::sys::AVBufferRef {
+    pub fn as_raw_mut(&mut self) -> &mut ffmpeg::sys::AVBufferRef {
-		unsafe { &mut *self.buffer }
+        unsafe { &mut *self.buffer }
-	}
+    }
-	// pub fn as_raw(&self) -> &ffmpeg::sys::AVBufferRef {
+    pub fn as_device_context_mut(&mut self) -> &mut ffmpeg::sys::AVBufferRef {
-	// 	unsafe { &*self.buffer }
+        unsafe {
-	// }
+            self._cuda_device_context.as_raw_mut()
        }
    }
 	/// call once to allocate frame
    pub fn get_buffer(&mut self, frame: &mut ffmpeg::frame::Video) -> Result<(), ffmpeg::Error> {
        unsafe {
            super::check_ret(ffmpeg::sys::av_hwframe_get_buffer(self.buffer, frame.as_mut_ptr(), 0))?;
        }
 		Ok(())
    }
    // pub fn as_raw(&self) -> &ffmpeg::sys::AVBufferRef {
    // 	unsafe { &*self.buffer }
    // }
 }
-unsafe impl Send for HwFrameContext { }
+unsafe impl Send for HwFrameContext {}
 pub struct HwFrameContextBuilder {
-	cuda_device_context: CudaDeviceContext,
+    cuda_device_context: CudaDeviceContext,
-	buffer: *mut ffmpeg::sys::AVBufferRef,
+    buffer: *mut ffmpeg::sys::AVBufferRef,
 }
 impl HwFrameContextBuilder {
-	pub fn new(mut cuda_device_context: CudaDeviceContext) -> anyhow::Result<Self> {
+    pub fn new(mut cuda_device_context: CudaDeviceContext) -> anyhow::Result<Self> {
-		let buffer = unsafe { ffmpeg::sys::av_hwframe_ctx_alloc(cuda_device_context.as_raw_mut()) };
+        let buffer = unsafe { ffmpeg::sys::av_hwframe_ctx_alloc(cuda_device_context.as_raw_mut()) };
-		if buffer.is_null() {
+        if buffer.is_null() {
-			return Err(anyhow::anyhow!("could not allocate a hwframe context"));
+            return Err(anyhow::anyhow!("could not allocate a hwframe context"));
-		}
+        }
-		Ok(Self { cuda_device_context, buffer })
+        Ok(Self {
-	}
+            cuda_device_context,
            buffer,
        })
    }
-	pub fn build(mut self) -> Result<HwFrameContext, ffmpeg::Error> {
+    pub fn build(mut self) -> Result<HwFrameContext, ffmpeg::Error> {
-		check_ret(unsafe { ffmpeg::sys::av_hwframe_ctx_init(self.buffer) })?;
+        check_ret(unsafe { ffmpeg::sys::av_hwframe_ctx_init(self.buffer) })?;
-		let result = Ok(HwFrameContext::new(self.cuda_device_context, self.buffer));
+        let result = Ok(HwFrameContext::new(self.cuda_device_context, self.buffer));
-		self.buffer = null_mut();
+        self.buffer = null_mut();
-		result
+        result
-	}
+    }
-	pub fn set_width(mut self, width: u32) -> Self {
+    pub fn set_width(mut self, width: u32) -> Self {
-		self.as_frame_mut().width = width as i32;
+        self.as_frame_mut().width = width as i32;
-		self
+        self
-	}
+    }
-	pub fn set_height(mut self, height: u32) -> Self {
+    pub fn set_height(mut self, height: u32) -> Self {
-		self.as_frame_mut().height = height as i32;
+        self.as_frame_mut().height = height as i32;
-		self
+        self
-	}
+    }
-	pub fn set_sw_format(mut self, sw_format: Pixel) -> Self {
+    pub fn set_sw_format(mut self, sw_format: Pixel) -> Self {
-		self.as_frame_mut().sw_format = sw_format.into();
+        self.as_frame_mut().sw_format = sw_format.into();
-		self
+        self
-	}
+    }
-	pub fn set_format(mut self, format: Pixel) -> Self {
+    pub fn set_format(mut self, format: Pixel) -> Self {
-		self.as_frame_mut().format = format.into();
+        self.as_frame_mut().format = format.into();
-		self
+        self
-	}
+    }
-	pub fn as_frame_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext {
+    pub fn as_frame_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext {
-		unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) }
+        unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) }
-	}
+    }
-	// pub fn as_frame(&self) -> &ffmpeg::sys::AVHWFramesContext {
+    // pub fn as_frame(&self) -> &ffmpeg::sys::AVHWFramesContext {
-	// 	unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) }
+    // 	unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) }
-	// }
+    // }
 }
--- a/server/src/video/mod.rs
+++ b/server/src/video/mod.rs
@ -9,6 +9,8 @@ pub mod hwframe;
 pub mod encoder_thread;
 pub mod cuda_gl;
 // from hgaiser/moonshine
 pub fn check_ret(error_code: i32) -> Result<(), ffmpeg::Error> {
 	if error_code != 0 {