From 09142f9668d1d90e15126c3989a7a93f97fa5aa2 Mon Sep 17 00:00:00 2001 From: modeco80 Date: Mon, 14 Oct 2024 07:25:59 -0400 Subject: [PATCH] working gpu-only encode --- server/Cargo.lock | 1 + server/Cargo.toml | 1 + server/src/main.rs | 128 +++++++++---- server/src/retro_thread.rs | 283 ++++++++++++++-------------- server/src/surface.rs | 23 ++- server/src/video/cuda_gl/bindgen.sh | 27 +++ server/src/video/cuda_gl/gl.h | 1 + server/src/video/cuda_gl/mod.rs | 14 ++ server/src/video/cuda_gl/safe.rs | 135 +++++++++++++ server/src/video/cuda_gl/sys.rs | 73 +++++++ server/src/video/encoder_thread.rs | 269 +++++++++++++++++++++++++- server/src/video/h264_encoder.rs | 107 +++++++---- server/src/video/hwframe.rs | 133 +++++++------ server/src/video/mod.rs | 2 + 14 files changed, 920 insertions(+), 277 deletions(-) create mode 100755 server/src/video/cuda_gl/bindgen.sh create mode 100644 server/src/video/cuda_gl/gl.h create mode 100644 server/src/video/cuda_gl/mod.rs create mode 100644 server/src/video/cuda_gl/safe.rs create mode 100644 server/src/video/cuda_gl/sys.rs diff --git a/server/Cargo.lock b/server/Cargo.lock index 38fd3a3..adb5d8e 100644 --- a/server/Cargo.lock +++ b/server/Cargo.lock @@ -1414,6 +1414,7 @@ dependencies = [ "futures-util", "gl", "letsplay_gpu", + "libloading", "rand", "retro_frontend", "serde", diff --git a/server/Cargo.toml b/server/Cargo.toml index 11a6121..875e15a 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -32,6 +32,7 @@ tracing = "0.1.40" tracing-subscriber = "0.3.18" xkeysym = "0.2.1" async-trait = "0.1.83" +libloading = "0.8.5" [patch.crates-io] diff --git a/server/src/main.rs b/server/src/main.rs index cd5f216..ee3bc01 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -5,10 +5,15 @@ mod video; mod transport; +use anyhow::Context; use async_trait::async_trait; +use cudarc::driver::CudaDevice; +use letsplay_gpu::egl_helpers::DeviceContext; use retro_thread::{spawn_retro_thread, RetroEvent}; +use transport::websocket::WebsocketTransport; use transport::{Transport, TransportReciever}; +use video::cuda_gl::safe::GraphicsResource; use video::encoder_thread::EncodeThreadInput; use video::{encoder_thread, ffmpeg}; @@ -44,19 +49,19 @@ enum WsMessage { Json(String), } -struct AppState { +struct AppState { encoder_tx: Arc>>, inputs: Arc>>, - transport: Arc, + transport: Arc, connection_count: TokioMutex, } -impl AppState { - fn new( - encoder_tx: mpsc::Sender, - transport: Arc, - ) -> Self { +impl AppState +where + T: Transport + Send + Sync + 'static, +{ + fn new(encoder_tx: mpsc::Sender, transport: Arc) -> Self { Self { encoder_tx: Arc::new(TokioMutex::new(encoder_tx)), inputs: Arc::new(TokioMutex::new(Vec::new())), @@ -67,7 +72,10 @@ impl AppState { } #[async_trait] -impl TransportReciever for AppState { +impl TransportReciever for AppState +where + T: Transport + Send + Sync + 'static, +{ async fn on_connect(&self, username: &String) -> anyhow::Result<()> { println!("{username} joined!"); @@ -105,7 +113,6 @@ impl TransportReciever for AppState { "msg": json["msg"].as_str().unwrap() }); - self.transport .broadcast_message(transport::TransportMessage::Text( serde_json::to_string(&send).expect("oh well"), @@ -177,6 +184,23 @@ impl TransportReciever for AppState { } } +const FLIP_SRC: &str = " +extern \"C\" __global__ void flip_opengl( + const unsigned* pSrc, + unsigned* pDest, + unsigned width, + unsigned height +) { + const unsigned x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x < 
width && y < height) { + // let reversed_y = (size.height - 1) - y; + unsigned reversed_y = (height - 1) - y; + pDest[y * width + x] = pSrc[reversed_y * width + x]; + } +}"; + #[tokio::main(flavor = "multi_thread", worker_threads = 2)] async fn main() -> anyhow::Result<()> { // Setup a tracing subscriber @@ -186,36 +210,56 @@ async fn main() -> anyhow::Result<()> { tracing::subscriber::set_global_default(subscriber).unwrap(); - let surface = Arc::new(Mutex::new(surface::Surface::new())); - // H.264 encoder related - let frame: Arc>> = Arc::new(Mutex::new(None)); - let (mut encoder_rx, encoder_tx) = encoder_thread::encoder_thread_spawn(&frame); + let device = CudaDevice::new(0)?; - let transport = Arc::new(crate::transport::websocket::WebsocketTransport::new()); + let ptx = cudarc::nvrtc::compile_ptx(&FLIP_SRC).with_context(|| "compiling support kernel")?; + println!("compiled kernel"); + + // pop it in + device.load_ptx(ptx, "module", &["flip_opengl"])?; + + let egl_ctx = Arc::new(Mutex::new(DeviceContext::new(0))); + + let resource = Arc::new(Mutex::new(GraphicsResource::new(&device))); + + let (mut encoder_rx, encoder_tx) = encoder_thread::encoder_thread_spawn_hwframe( + &device.clone(), + &resource.clone(), + &egl_ctx.clone(), + ); + + let transport = Arc::new(WebsocketTransport::new()); let state = Arc::new(AppState::new(encoder_tx, transport.clone())); - let (mut event_rx, event_in_tx) = spawn_retro_thread(surface.clone()); + let (mut retro_event_rx, retro_input_event_tx) = + spawn_retro_thread(egl_ctx.clone(), resource.clone()); let state_clone = state.clone(); - let transport_clone = transport.clone(); - // retro event handler. drives the encoder thread too let _ = std::thread::Builder::new() .name("retro_event_rx".into()) .spawn(move || { - let surface_clone = surface.clone(); - let frame_clone = frame.clone(); + // load game + let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadCore( + "cores/swanstation_libretro.so".into(), + )); + let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadGame( + "roms/nmv1_us.cue".into(), + )); // start the libretro thread looping now that we're alive - let _ = event_in_tx.blocking_send(retro_thread::RetroInEvent::Start); + let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::Start); loop { - match event_rx.try_recv() { - Ok(msg) => match msg { + match retro_event_rx.blocking_recv() { + Some(msg) => match msg { RetroEvent::Frame => { + /* + let mut same = true; + { let mut frame_locked = frame.lock().expect( "Couldn't lock frame on our end. Did the encoder thread panic?", @@ -229,6 +273,8 @@ async fn main() -> anyhow::Result<()> { let mut surf = surface_clone.lock().expect( "locking the VNC surface to paint it to the ffmpeg frame failed", ); + + let surf_buf = surf.get_buffer(); let buf_ptr = @@ -247,12 +293,27 @@ async fn main() -> anyhow::Result<()> { ) }; + // If any line differs then the frame isn't the same and should be encoded. 
+ if &surf_buf[line_stride..line_stride + width as usize] + != dest_line_slice + { + same = false; + } + dest_line_slice.copy_from_slice( &surf_buf[line_stride..line_stride + width as usize], ); } } + if !same { + let _ = state_clone + .encoder_tx + .blocking_lock() + .blocking_send(encoder_thread::EncodeThreadInput::SendFrame); + } + */ + let _ = state_clone .encoder_tx .blocking_lock() @@ -261,6 +322,12 @@ async fn main() -> anyhow::Result<()> { RetroEvent::Resize { size } => { // make a new frame for the encoder + + let _ = state_clone.encoder_tx.blocking_lock().blocking_send( + encoder_thread::EncodeThreadInput::Init { size: size.clone() }, + ); + + /* { let mut lk_frame = frame_clone.lock().expect("Couldn't lock frame"); @@ -270,30 +337,21 @@ async fn main() -> anyhow::Result<()> { size.clone().height, )); } - - let _ = state_clone.encoder_tx.blocking_lock().blocking_send( - encoder_thread::EncodeThreadInput::Init { size: size.clone() }, - ); + */ } RetroEvent::WantInputs { tx } => { let inputs = state_clone.inputs.blocking_lock(); - //tracing::info!("giving inputs {:?}", inputs); tx.send(inputs.clone()).expect("FUCK"); } }, - Err(TryRecvError::Disconnected) => break, - Err(TryRecvError::Empty) => {} + None => break, } match encoder_rx.try_recv() { Ok(msg) => match msg { encoder_thread::EncodeThreadOutput::Frame { packet } => { - // let _ = state_clone - // .websocket_broadcast_tx - // .send(WsMessage::VideoPacket { packet }); - // :( let packet_data = { let slice = packet.data().expect( @@ -301,7 +359,9 @@ async fn main() -> anyhow::Result<()> { ); slice.to_vec() }; - let _ = transport_clone.broadcast_message(transport::TransportMessage::Binary(packet_data)); + let _ = state_clone.transport.broadcast_message( + transport::TransportMessage::Binary(packet_data), + ); } }, Err(TryRecvError::Empty) => {} diff --git a/server/src/retro_thread.rs b/server/src/retro_thread.rs index 2e7cdcd..874a168 100644 --- a/server/src/retro_thread.rs +++ b/server/src/retro_thread.rs @@ -17,7 +17,7 @@ use retro_frontend::{ use gpu::egl_helpers::DeviceContext; use letsplay_gpu as gpu; -use crate::{surface::Surface, types::Size}; +use crate::{surface::Surface, types::Size, video::cuda_gl::safe::GraphicsResource}; /// Called by OpenGL. We use this to dump errors. extern "system" fn opengl_message_callback( @@ -49,30 +49,38 @@ pub struct RetroState { pad: RetroPad, // EGL state - egl_context: Option, + egl_context: Arc>, /// Locked framebuffer. - framebuffer: Arc>, + software_framebuffer: Surface, /// OpenGL FBO gl_framebuffer: gpu::GlFramebuffer, - /// Cached readback buffer. - readback_buffer: Surface, + gl_rendering: bool, + + cuda_resource: Arc>, event_tx: mpsc::Sender, } impl RetroState { - pub fn new(framebuffer: Arc>, event_tx: mpsc::Sender) -> Box { + pub fn new( + device_context: Arc>, + resource: Arc>, + event_tx: mpsc::Sender, + ) -> Box { let mut boxed = Box::new(Self { frontend: None, pad: RetroPad::new(), - egl_context: None, - framebuffer, + egl_context: device_context.clone(), + software_framebuffer: Surface::new(), gl_framebuffer: gpu::GlFramebuffer::new(), - readback_buffer: Surface::new(), + gl_rendering: false, + + cuda_resource: resource.clone(), + event_tx, }); @@ -127,14 +135,29 @@ impl RetroState { /// Initalizes the headless EGL context used for OpenGL rendering. fn hw_gl_egl_init(&mut self) { - self.egl_context = Some(DeviceContext::new(0)); + self.egl_context.lock().expect("piss").make_current(); + + unsafe { + // Load OpenGL functions using the EGL loader. 
+ gl::load_with(|s| { + let str = std::ffi::CString::new(s).expect("gl::load_with fail"); + std::mem::transmute(gpu::egl::GetProcAddress(str.as_ptr())) + }); + + // set OpenGL debug message callback + gl::Enable(gl::DEBUG_OUTPUT); + gl::DebugMessageCallback(Some(opengl_message_callback), std::ptr::null()); + } } /// Destroys OpenGL resources and the EGL context. fn hw_gl_destroy(&mut self) { - if self.egl_context.is_some() { - self.gl_framebuffer.destroy(); - self.egl_context.take().unwrap().destroy() + self.gl_framebuffer.destroy(); + + { + let mut locked = self.egl_context.lock().expect("piss"); + locked.release(); + locked.destroy(); } } @@ -143,59 +166,39 @@ impl RetroState { let step_ms = (1.0 / av_info.timing.fps) * 1000.; let step_duration = Duration::from_millis(step_ms as u64); - self.get_frontend().run_frame(); + { + let egl = (&mut self.egl_context).clone(); + let locked_egl = egl.lock().expect("fuck YOU"); + + locked_egl.make_current(); + self.get_frontend().run_frame(); + locked_egl.release(); + } + std::thread::sleep(step_duration); } /// Bleh, I don't like this is an associated fn, but whatever - fn update_impl(framebuffer: Arc>, slice: &[u32], pitch: u32, from_opengl: bool) { - let mut framebuffer_locked = framebuffer.lock().expect("could not lock framebuffer"); - - let size = framebuffer_locked.size.clone(); - let buffer = framebuffer_locked.get_buffer(); + fn update_software_framebuffer(framebuffer: &mut Surface, slice: &[u32], pitch: u32) { + let size = framebuffer.size.clone(); + let buffer = framebuffer.get_buffer(); let has_disconnected_pitch = pitch != size.width as u32; - // If this frame came from OpenGL we need to flip the image around - // so it is right side up (from our perspective). - // - // We do this in a bit of a convoluted way (but it's also zero allocation!) 
- if from_opengl { - for y in (0..size.height).rev() { - let src_line_off = (y as u32 * pitch) as usize; - let src_slice = &slice[src_line_off..src_line_off + size.width as usize]; + for y in 0..size.height { + let src_line_off = (y as u32 * pitch) as usize; + let mut dest_line_off = (y as u32 * size.width) as usize; - let reversed_y = (size.height - 1) - y; - - let src_line_off = (reversed_y as u32 * pitch) as usize; - let mut dest_line_off = src_line_off; - - // copy only - if has_disconnected_pitch { - dest_line_off = (reversed_y * pitch.min(size.width)) as usize; - } - - let dest_slice = &mut buffer[dest_line_off..dest_line_off + size.width as usize]; - - dest_slice.copy_from_slice(src_slice); + // copy only + if has_disconnected_pitch { + dest_line_off = (y * pitch) as usize; } - } else { - for y in 0..size.height { - let src_line_off = (y as u32 * pitch) as usize; - let mut dest_line_off = src_line_off; - // copy only - if has_disconnected_pitch { - dest_line_off = (y * size.width) as usize; - } + // Create slices repressenting each part + let src_slice = &slice[src_line_off..src_line_off + size.width as usize]; + let dest_slice = &mut buffer[dest_line_off..dest_line_off + size.width as usize]; - // Create slices repressenting each part - let src_slice = &slice[src_line_off..src_line_off + size.width as usize]; - let dest_slice = &mut framebuffer_locked.get_buffer() - [dest_line_off..dest_line_off + size.width as usize]; - - dest_slice.copy_from_slice(src_slice); - } + dest_slice.copy_from_slice(src_slice); } } } @@ -204,21 +207,32 @@ impl FrontendInterface for RetroState { fn video_resize(&mut self, width: u32, height: u32) { tracing::info!("Resized to {width}x{height}"); - if self.egl_context.is_some() { - self.gl_framebuffer.resize(width, height); - let raw = self.gl_framebuffer.as_raw(); + self.gl_framebuffer.resize(width, height); + let raw = self.gl_framebuffer.as_raw(); - // Notify the frontend layer about the new FBO ID - self.get_frontend().set_gl_fbo(raw); + // Notify the frontend layer about the new FBO ID + self.get_frontend().set_gl_fbo(raw); - // Resize the readback buffer - self.readback_buffer.resize(Size { width, height }); + if !self.gl_rendering { + self.software_framebuffer.resize(Size { width, height }); } - self.framebuffer - .lock() - .expect("its over?") - .resize(Size { width, height }); + unsafe { + } + + // map to cuda + { + let mut locked = self + .cuda_resource + .lock() + .expect("YOU MOTHERFUCKER PISS GO COUNT YOUR DICK"); + + locked.device().bind_to_thread().expect("fuck"); + + locked + .register(self.gl_framebuffer.texture_id(), gl::TEXTURE_2D) + .expect("you fucking asswater"); + } let _ = self.event_tx.blocking_send(RetroEvent::Resize { size: Size { width, height }, @@ -226,39 +240,29 @@ impl FrontendInterface for RetroState { } fn video_update(&mut self, slice: &[u32], pitch: u32) { - Self::update_impl(self.framebuffer.clone(), slice, pitch, false); + Self::update_software_framebuffer(&mut self.software_framebuffer, slice, pitch); + + let size = self.software_framebuffer.size.clone(); + + // upload texture to GPU + unsafe { + gl::TexImage2D( + gl::TEXTURE_2D, + 0, + gl::RGBA as i32, + size.width as i32, + size.height as i32, + 0, + gl::RGBA_INTEGER, + gl::UNSIGNED_INT_8_8_8_8, + self.software_framebuffer.get_buffer().as_mut_ptr() as *const _, + ); + } + let _ = self.event_tx.blocking_send(RetroEvent::Frame); } fn video_update_gl(&mut self) { - let dimensions = self.get_frontend().get_size(); - - // Read back the framebuffer - let slice = { - // 
lame, doesn't do bgra conversion for us - //self.gl_framebuffer.read_pixels( - // &mut self.readback_buffer.get_buffer()[..], - // dimensions.0, - // dimensions.1, - //); - - unsafe { - let _bind = self.gl_framebuffer.bind(); - gl::ReadPixels( - 0, - 0, - dimensions.0 as i32, - dimensions.1 as i32, - gl::BGRA, - gl::UNSIGNED_BYTE, - (&mut self.readback_buffer.get_buffer()).as_mut_ptr() as *mut std::ffi::c_void, - ); - } - - self.readback_buffer.get_buffer() - }; - - Self::update_impl(self.framebuffer.clone(), slice, dimensions.0, true); let _ = self.event_tx.blocking_send(RetroEvent::Frame); } @@ -270,9 +274,13 @@ impl FrontendInterface for RetroState { let (tx, rx) = oneshot::channel(); let _ = self.event_tx.blocking_send(RetroEvent::WantInputs { tx }); - let inputs = rx.blocking_recv().expect("what the FUCK are you doing"); + let inputs = rx.blocking_recv(); - for key in &inputs { + if inputs.is_err() { + return; + } + + for key in &inputs.unwrap() { use xkeysym::key as Key; match *key { @@ -349,38 +357,23 @@ impl FrontendInterface for RetroState { fn hw_gl_init(&mut self) -> Option { // Only create a new EGL/OpenGL context if we have to. - if self.egl_context.is_none() { - // Initalize EGL - self.hw_gl_egl_init(); + let context = self.egl_context.lock().expect("fuck you!"); + let extensions = gpu::egl_helpers::get_extensions(context.get_display()); - let context = self.egl_context.as_ref().unwrap(); - let extensions = gpu::egl_helpers::get_extensions(context.get_display()); + tracing::debug!("Supported EGL extensions: {:?}", extensions); - tracing::debug!("Supported EGL extensions: {:?}", extensions); - - // Check for EGL_KHR_get_all_proc_addresses, so we can use eglGetProcAddress() to load OpenGL functions - if !extensions.contains(&"EGL_KHR_get_all_proc_addresses".into()) { - tracing::error!("Your graphics driver doesn't support the EGL_KHR_get_all_proc_addresses extension."); - tracing::error!("Retrodemo currently needs this to load OpenGL functions. HW rendering will be disabled."); - return None; - } - - unsafe { - // Load OpenGL functions using the EGL loader. - gl::load_with(|s| { - let str = std::ffi::CString::new(s).expect("gl::load_with fail"); - std::mem::transmute(gpu::egl::GetProcAddress(str.as_ptr())) - }); - - // set OpenGL debug message callback - gl::Enable(gl::DEBUG_OUTPUT); - gl::DebugMessageCallback(Some(opengl_message_callback), std::ptr::null()); - } + // Check for EGL_KHR_get_all_proc_addresses, so we can use eglGetProcAddress() to load OpenGL functions + if !extensions.contains(&"EGL_KHR_get_all_proc_addresses".into()) { + tracing::error!("Your graphics driver doesn't support the EGL_KHR_get_all_proc_addresses extension."); + tracing::error!("Retrodemo currently needs this to load OpenGL functions. HW rendering will be disabled."); + return None; } - // Create the initial FBO for the core to render to - let dimensions = self.get_frontend().get_size(); - self.gl_framebuffer.resize(dimensions.0, dimensions.1); + // If we get here, we can be certain that we're no longer + // going to use software rendering. Therefore, we can + // clear (free) the software framebuffer, since it won't be of use to us anymore. 
+ self.software_framebuffer.clear(); + self.gl_rendering = true; return Some(HwGlInitData { get_proc_address: gpu::egl::GetProcAddress as *mut std::ffi::c_void, @@ -403,25 +396,34 @@ pub enum RetroEvent { pub enum RetroInEvent { Start, + LoadCore(std::path::PathBuf), + LoadGame(std::path::PathBuf), } fn retro_thread_main( - surface: Arc>, + context: &Arc>, + resource: &Arc>, event_tx: mpsc::Sender, mut event_rx: mpsc::Receiver, ) { - let mut app = RetroState::new(surface, event_tx); + let mut app = RetroState::new(context.clone(), resource.clone(), event_tx); - app.load_core("cores/swanstation_libretro.so") - .expect("failed to load core"); - app.load_game("roms/merged/nmv3/us/nmv3_us.cue") //merged/nmv1/us/nmv1_us.cue - .expect("failed to load game"); + // do EGL init first + app.hw_gl_egl_init(); - // sync + // pre-setup loop { match event_rx.blocking_recv() { None => return (), Some(msg) => match msg { + RetroInEvent::LoadCore(path) => { + app.load_core(path).expect("Failed to load core."); + } + + RetroInEvent::LoadGame(path) => { + app.load_game(path).expect("Failed to load game!"); + } + RetroInEvent::Start => break, }, } @@ -435,18 +437,23 @@ fn retro_thread_main( } pub fn spawn_retro_thread( - surface: Arc>, + context: Arc>, + resource: Arc>, ) -> (mpsc::Receiver, mpsc::Sender) { - let (event_tx, event_rx) = mpsc::channel(8); - let (event_in_tx, event_in_rx) = mpsc::channel(8); - let fb_clone = surface.clone(); + // essentially semaphores + let (event_tx, event_rx) = mpsc::channel(1); + let (event_in_tx, event_in_rx) = mpsc::channel(1); + + let cloned = resource.clone(); + let ctxcloned = context.clone(); // discard the join handle let _ = std::thread::Builder::new() .name("retro_game".into()) .spawn(move || { - retro_thread_main(fb_clone, event_tx, event_in_rx); - }).expect("failed to spawn the game thread"); + retro_thread_main(&ctxcloned, &cloned, event_tx, event_in_rx); + }) + .expect("failed to spawn the game thread"); (event_rx, event_in_tx) } diff --git a/server/src/surface.rs b/server/src/surface.rs index 8daad3d..4e1e237 100644 --- a/server/src/surface.rs +++ b/server/src/surface.rs @@ -7,7 +7,7 @@ use super::types::*; use std::alloc; -/// Allocates a boxed slice. +/// Allocates a boxed slice. /// Unlike a [Vec<_>], this can't grow, /// but is just as safe to use, and slightly more predictable. pub fn alloc_boxed_slice(len: usize) -> Box<[T]> { @@ -21,7 +21,7 @@ pub fn alloc_boxed_slice(len: usize) -> Box<[T]> { unsafe { Box::from_raw(slice) } } -/// A BGRA-format surface. +/// A BGRA (or RGBA, I'm not your dad. There are funtions that assume the former though) format surface. 
pub struct Surface { buffer: Option>, pub size: Size, @@ -38,10 +38,18 @@ impl Surface { } } + pub fn clear(&mut self) { + self.buffer = None; + self.size = Size { + width: 0, + height: 0, + }; + } + pub fn resize(&mut self, size: Size) { self.size = size; - - self.buffer = Some(alloc_boxed_slice(self.size.linear())); + + self.buffer = Some(alloc_boxed_slice(self.size.linear())); } pub fn get_buffer(&mut self) -> &mut [u32] { @@ -53,15 +61,14 @@ impl Surface { pub fn blit_buffer(&mut self, src_at: Rect, data: &[u32]) { let mut off = 0; - let buf = self.buffer.as_mut().unwrap(); - let buf_slice = &mut *buf; + let buf = self.buffer.as_mut().unwrap(); + let buf_slice = &mut *buf; for y in src_at.y..src_at.y + src_at.height { let src = &data[off..off + src_at.width as usize]; let dest_start_offset = (y as usize * self.size.width as usize) + src_at.x as usize; - let dest = - &mut buf_slice[dest_start_offset..dest_start_offset + src_at.width as usize]; + let dest = &mut buf_slice[dest_start_offset..dest_start_offset + src_at.width as usize]; // This forces alpha to always be 0xff. I *could* probably do this in a clearer way though :( for (dest, src_item) in dest.iter_mut().zip(src.iter()) { diff --git a/server/src/video/cuda_gl/bindgen.sh b/server/src/video/cuda_gl/bindgen.sh new file mode 100755 index 0000000..1f9dade --- /dev/null +++ b/server/src/video/cuda_gl/bindgen.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Does bindgen for CUDA cudaGL. (needs postprocessing) +set -exu + +# --allowlist-type="^CU.*" \ +# --allowlist-type="^cuuint(32|64)_t" \ +# --allowlist-type="^cudaError_enum" \ +# --allowlist-type="^cu.*Complex$" \ +# --allowlist-type="^cuda.*" \ +# --allowlist-type="^libraryPropertyType.*" \ +# --allowlist-var="^CU.*" \ + +echo "use cudarc::sys::*; /* Hack :3 */" > ./sys.rs + +bindgen \ + --allowlist-type="" \ + --allowlist-function="^cuGraphicsGL.*" \ + --default-enum-style=rust \ + --no-doc-comments \ + --with-derive-default \ + --with-derive-eq \ + --with-derive-hash \ + --with-derive-ord \ + --use-core \ + --dynamic-loading Lib \ + gl.h -- -I/opt/cuda/include \ + >> ./sys.rs \ No newline at end of file diff --git a/server/src/video/cuda_gl/gl.h b/server/src/video/cuda_gl/gl.h new file mode 100644 index 0000000..31e1454 --- /dev/null +++ b/server/src/video/cuda_gl/gl.h @@ -0,0 +1 @@ +#include "cudaGL.h" \ No newline at end of file diff --git a/server/src/video/cuda_gl/mod.rs b/server/src/video/cuda_gl/mod.rs new file mode 100644 index 0000000..47cc631 --- /dev/null +++ b/server/src/video/cuda_gl/mod.rs @@ -0,0 +1,14 @@ +pub mod sys; +use sys::*; + +pub mod safe; + +pub unsafe fn lib() -> &'static Lib { + static LIB: std::sync::OnceLock = std::sync::OnceLock::new(); + LIB.get_or_init(|| { + if let Ok(lib) = Lib::new(libloading::library_filename("cuda")) { + return lib; + } + panic!("cuda library doesn't exist."); + }) +} \ No newline at end of file diff --git a/server/src/video/cuda_gl/safe.rs b/server/src/video/cuda_gl/safe.rs new file mode 100644 index 0000000..11f2b0c --- /dev/null +++ b/server/src/video/cuda_gl/safe.rs @@ -0,0 +1,135 @@ +use cudarc::driver::{result as cuda_result, safe as cuda_safe, sys as cuda_sys, CudaDevice}; + +use super::sys; + +use std::sync::Arc; + +pub struct MappedGraphicsResource { + resource: cuda_sys::CUgraphicsResource, +} + +impl MappedGraphicsResource { + fn new(resource: cuda_sys::CUgraphicsResource) -> Self { + Self { resource } + } + + pub fn map(&mut self) -> Result<(), cuda_result::DriverError> { + unsafe { + cuda_sys::lib() + 
.cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()) + .result()?; + } + Ok(()) + } + + pub fn unmap(&mut self) -> Result<(), cuda_result::DriverError> { + unsafe { + cuda_sys::lib() + .cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()) + .result()?; + } + + Ok(()) + } + + pub fn get_mapped_array(&mut self) -> Result { + assert!( + !self.resource.is_null(), + "do not call GraphicsResource::get_mapped_array if no resource is actually registered" + ); + + let mut array: cuda_sys::CUarray = std::ptr::null_mut(); + + unsafe { + cuda_sys::lib() + .cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) + .result()?; + } + + Ok(array) + } +} + +impl Drop for MappedGraphicsResource { + fn drop(&mut self) { + let _ = self.unmap(); + } +} + +/// Wrapper over cuGraphicsGL* apis +pub struct GraphicsResource { + context: Arc, + resource: cuda_sys::CUgraphicsResource, +} + +impl GraphicsResource { + pub fn new(device: &Arc) -> Self { + Self { + context: device.clone(), + resource: std::ptr::null_mut(), + } + } + + pub fn device(&self) -> Arc { + self.context.clone() + } + + /// Maps this resource. + pub fn map(&mut self) -> Result { + let mut res = MappedGraphicsResource::new(self.resource); + res.map()?; + + Ok(res) + } + + pub fn register( + &mut self, + texture_id: gl::types::GLuint, + texture_kind: gl::types::GLuint, + ) -> Result<(), cuda_result::DriverError> { + // better to be safe than leak memory? idk. + if !self.resource.is_null() { + self.unregister()?; + } + + unsafe { + super::lib() + .cuGraphicsGLRegisterImage(&mut self.resource, texture_id, texture_kind, 1) + .result()?; + } + + Ok(()) + } + + pub fn is_registered(&self) -> bool { + !self.resource.is_null() + } + + pub fn unregister(&mut self) -> Result<(), cuda_result::DriverError> { + assert!( + !self.resource.is_null(), + "do not call if no resource is actually registered" + ); + + unsafe { + cuda_sys::lib() + .cuGraphicsUnregisterResource(self.resource) + .result()?; + } + + self.resource = std::ptr::null_mut(); + Ok(()) + } + + +} + +impl Drop for GraphicsResource { + fn drop(&mut self) { + if self.is_registered() { + let _ = self.unregister(); + } + } +} + +unsafe impl Send for GraphicsResource {} diff --git a/server/src/video/cuda_gl/sys.rs b/server/src/video/cuda_gl/sys.rs new file mode 100644 index 0000000..f0c3313 --- /dev/null +++ b/server/src/video/cuda_gl/sys.rs @@ -0,0 +1,73 @@ +use cudarc::driver::sys::*; /* Hack :3 */ +use gl::types::{GLenum, GLuint}; +/* automatically generated by rust-bindgen 0.69.4 */ + +pub struct Lib { + __library: ::libloading::Library, + pub cuGraphicsGLRegisterBuffer: Result< + unsafe extern "C" fn( + pCudaResource: *mut CUgraphicsResource, + buffer: GLuint, + Flags: ::core::ffi::c_uint, + ) -> CUresult, + ::libloading::Error, + >, + pub cuGraphicsGLRegisterImage: Result< + unsafe extern "C" fn( + pCudaResource: *mut CUgraphicsResource, + image: GLuint, + target: GLenum, + Flags: ::core::ffi::c_uint, + ) -> CUresult, + ::libloading::Error, + >, +} +impl Lib { + pub unsafe fn new
<P>
(path: P) -> Result + where + P: AsRef<::std::ffi::OsStr>, + { + let library = ::libloading::Library::new(path)?; + Self::from_library(library) + } + pub unsafe fn from_library(library: L) -> Result + where + L: Into<::libloading::Library>, + { + let __library = library.into(); + let cuGraphicsGLRegisterBuffer = __library + .get(b"cuGraphicsGLRegisterBuffer\0") + .map(|sym| *sym); + let cuGraphicsGLRegisterImage = __library + .get(b"cuGraphicsGLRegisterImage\0") + .map(|sym| *sym); + Ok(Lib { + __library, + cuGraphicsGLRegisterBuffer, + cuGraphicsGLRegisterImage, + }) + } + pub unsafe fn cuGraphicsGLRegisterBuffer( + &self, + pCudaResource: *mut CUgraphicsResource, + buffer: GLuint, + Flags: ::core::ffi::c_uint, + ) -> CUresult { + (self + .cuGraphicsGLRegisterBuffer + .as_ref() + .expect("Expected function, got error."))(pCudaResource, buffer, Flags) + } + pub unsafe fn cuGraphicsGLRegisterImage( + &self, + pCudaResource: *mut CUgraphicsResource, + image: GLuint, + target: GLenum, + Flags: ::core::ffi::c_uint, + ) -> CUresult { + (self + .cuGraphicsGLRegisterImage + .as_ref() + .expect("Expected function, got error."))(pCudaResource, image, target, Flags) + } +} diff --git a/server/src/video/encoder_thread.rs b/server/src/video/encoder_thread.rs index d2247b2..d75ec75 100644 --- a/server/src/video/encoder_thread.rs +++ b/server/src/video/encoder_thread.rs @@ -1,11 +1,16 @@ +use cudarc::driver::{ + sys::{CUdeviceptr, CUmemorytype}, + CudaDevice, LaunchAsync, +}; +use letsplay_gpu::egl_helpers::DeviceContext; use std::{ sync::{Arc, Mutex}, time::Duration, }; use tokio::sync::mpsc::{self, error::TryRecvError}; -use super::ffmpeg; use super::h264_encoder::H264Encoder; +use super::{cuda_gl::safe::GraphicsResource, ffmpeg}; pub enum EncodeThreadInput { Init { size: crate::types::Size }, @@ -37,7 +42,7 @@ impl EncoderState { self.encoder = Some(H264Encoder::new_nvenc_swframe( size.clone(), 60, - 3 * (1024 * 1024), + 2 * (1024 * 1024), )?); // replace packet @@ -85,7 +90,76 @@ impl EncoderState { } } -fn encoder_thread_main( +struct EncoderStateHW { + encoder: Option, + frame: ffmpeg::frame::Video, + packet: ffmpeg::Packet, +} + +impl EncoderStateHW { + fn new() -> Self { + Self { + encoder: None, + frame: ffmpeg::frame::Video::empty(), + packet: ffmpeg::Packet::empty(), + } + } + + fn init(&mut self, device: &Arc, size: crate::types::Size) -> anyhow::Result<()> { + self.encoder = Some(H264Encoder::new_nvenc_hwframe( + &device, + size.clone(), + 60, + 2 * (1024 * 1024), + )?); + + // replace packet + self.packet = ffmpeg::Packet::empty(); + self.frame = self.encoder.as_mut().unwrap().create_frame()?; + + Ok(()) + } + + #[inline] + fn frame(&mut self) -> &mut ffmpeg::frame::Video { + &mut self.frame + } + + fn send_frame(&mut self, pts: u64, force_keyframe: bool) -> Option { + let frame = &mut self.frame; + let encoder = self.encoder.as_mut().unwrap(); + + // set frame metadata + unsafe { + if force_keyframe { + (*frame.as_mut_ptr()).pict_type = ffmpeg::sys::AVPictureType::AV_PICTURE_TYPE_I; + (*frame.as_mut_ptr()).flags = ffmpeg::sys::AV_FRAME_FLAG_KEY; + (*frame.as_mut_ptr()).key_frame = 1; + } else { + (*frame.as_mut_ptr()).pict_type = ffmpeg::sys::AVPictureType::AV_PICTURE_TYPE_NONE; + (*frame.as_mut_ptr()).flags = 0i32; + (*frame.as_mut_ptr()).key_frame = 0; + } + + (*frame.as_mut_ptr()).pts = pts as i64; + } + + encoder.send_frame(&*frame); + encoder + .receive_packet(&mut self.packet) + .expect("Failed to recieve packet"); + + unsafe { + if !self.packet.is_empty() { + return 
Some(self.packet.clone()); + } + } + + return None; + } +} + +fn encoder_thread_swframe_main( mut rx: mpsc::Receiver, tx: mpsc::Sender, frame: &Arc>>, @@ -143,7 +217,7 @@ fn encoder_thread_main( Ok(()) } -pub fn encoder_thread_spawn( +pub fn encoder_thread_spawn_swframe( frame: &Arc>>, ) -> ( mpsc::Receiver, @@ -154,7 +228,192 @@ pub fn encoder_thread_spawn( let clone = Arc::clone(frame); - std::thread::spawn(move || encoder_thread_main(in_rx, out_tx, &clone)); + std::thread::spawn(move || encoder_thread_swframe_main(in_rx, out_tx, &clone)); + + (out_rx, in_tx) +} + +fn encoder_thread_hwframe_main( + mut rx: mpsc::Receiver, + tx: mpsc::Sender, + + cuda_device: &Arc, + cuda_resource: &Arc>, + gl_context: &Arc>, +) -> anyhow::Result<()> { + let mut frame_number = 0u64; + let mut force_keyframe = false; + + let mut encoder = EncoderStateHW::new(); + + // :) + cuda_device.bind_to_thread()?; + + /* + + let mut memcpy = cudarc::driver::sys::CUDA_MEMCPY2D_st::default(); + // setup basic src stuff + memcpy.srcXInBytes = 0; + memcpy.srcY = 0; + memcpy.srcMemoryType = CUmemorytype::CU_MEMORYTYPE_ARRAY; + + // dest + memcpy.dstXInBytes = 0; + memcpy.dstY = 0; + memcpy.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE; + */ + + loop { + match rx.try_recv() { + Ok(msg) => match msg { + EncodeThreadInput::Init { size } => { + frame_number = 0; + + if force_keyframe { + force_keyframe = false; + } + + encoder + .init(cuda_device, size) + .expect("encoder init failed"); + } + + EncodeThreadInput::ForceKeyframe => { + force_keyframe = true; + } + + EncodeThreadInput::SendFrame => { + // copy gl frame *ON THE GPU* + { + let mut gl_ctx = gl_context.lock().expect("you dumb fuck"); + let mut gl_resource = + cuda_resource.lock().expect("couldnt lock GL resource!"); + + gl_ctx.make_current(); + + let mut mapped = gl_resource + .map() + .expect("couldnt map graphics resource. 
Its joever"); + + let array = mapped + .get_mapped_array() + .expect("well its all over anyways"); + + { + let frame = encoder.frame(); + let width = frame.width(); + let height = frame.height(); + + let launch_config = cudarc::driver::LaunchConfig { + grid_dim: (width / 16 + 1, height / 2 + 1, 0), + block_dim: (16, 2, 0), + shared_mem_bytes: 0, + }; + + let flip_opengl = cuda_device + .get_func("module", "flip_opengl") + .expect("dumb fucker"); + + unsafe { + let frame_ptr = frame.as_mut_ptr(); + + let mut params = [ + array as *mut std::ffi::c_void, + (*frame_ptr).data[0] as CUdeviceptr as *mut std::ffi::c_void, + width as *const u32 as *mut std::ffi::c_void, + height as *const u32 as *mut std::ffi::c_void, + ]; + + flip_opengl + .launch(launch_config, &mut params[..]) + .expect("oh its done"); + + + cudarc::driver::sys::lib() + .cuStreamSynchronize(std::ptr::null_mut()) + .result() + .expect("you banned"); + } + } + + /* + // setup the cuMemcpy2D + { + let frame = encoder.frame(); + + memcpy.srcArray = array; + + unsafe { + let frame_ptr = frame.as_mut_ptr(); + memcpy.dstDevice = (*frame_ptr).data[0] as CUdeviceptr; + memcpy.dstPitch = (*frame_ptr).linesize[0] as usize; + memcpy.WidthInBytes = ((*frame_ptr).width * 4) as usize; + memcpy.Height = (*frame_ptr).height as usize; + } + } + + unsafe { + cudarc::driver::sys::lib() + .cuMemcpy2DAsync_v2(&memcpy, std::ptr::null_mut()) + .result() + .expect("cuMemcpy2D fail epic"); + + cudarc::driver::sys::lib() + .cuStreamSynchronize(std::ptr::null_mut()) + .result() + .expect("you banned"); + + } */ + + mapped.unmap().expect("fuck you asshole"); + gl_ctx.release(); + } + + if let Some(pkt) = encoder.send_frame(frame_number as u64, force_keyframe) { + // A bit less clear than ::empty(), but it's "Safe" + if let Some(_) = pkt.data() { + let _ = tx.blocking_send(EncodeThreadOutput::Frame { + packet: pkt.clone(), + }); + } + + frame_number += 1; + } + + if force_keyframe { + force_keyframe = false; + } + } + }, + + Err(TryRecvError::Disconnected) => break, + Err(TryRecvError::Empty) => { + std::thread::sleep(Duration::from_millis(1)); + } + } + } + + Ok(()) +} + +pub fn encoder_thread_spawn_hwframe( + cuda_device: &Arc, + cuda_resource: &Arc>, + gl_context: &Arc>, +) -> ( + mpsc::Receiver, + mpsc::Sender, +) { + let (in_tx, in_rx) = mpsc::channel(1); + let (out_tx, out_rx) = mpsc::channel(1); + + let dev_clone = Arc::clone(cuda_device); + let rsrc_clone = Arc::clone(cuda_resource); + let gl_clone = Arc::clone(gl_context); + + std::thread::spawn(move || { + encoder_thread_hwframe_main(in_rx, out_tx, &dev_clone, &rsrc_clone, &gl_clone) + }); (out_rx, in_tx) } diff --git a/server/src/video/h264_encoder.rs b/server/src/video/h264_encoder.rs index 6c0ebd2..313fbfb 100644 --- a/server/src/video/h264_encoder.rs +++ b/server/src/video/h264_encoder.rs @@ -135,8 +135,8 @@ impl H264Encoder { video_encoder_context.set_format(ffmpeg::format::Pixel::ZRGB32); - video_encoder_context.set_qmin(38); - video_encoder_context.set_qmax(32); + video_encoder_context.set_qmin(37); + video_encoder_context.set_qmax(33); // set h264_nvenc options let mut dict = ffmpeg::Dictionary::new(); @@ -148,7 +148,7 @@ impl H264Encoder { // TODO: dict.set("rc", "vbr"); - dict.set("qp", "45"); + dict.set("qp", "35"); dict.set("forced-idr", "1"); @@ -169,9 +169,6 @@ impl H264Encoder { max_framerate: u32, bitrate: usize, ) -> anyhow::Result { - /* - (See FIXMEs above) - let cuda_device_context = super::hwdevice::CudaDeviceContextBuilder::new()? 
.set_cuda_context((*cuda_device.cu_primary_ctx()) as *mut _) .build() @@ -180,14 +177,53 @@ impl H264Encoder { let mut hw_frame_context = super::hwframe::HwFrameContextBuilder::new(cuda_device_context)? .set_width(size.width) .set_height(size.height) - .set_sw_format(ffmpeg::format::Pixel::ZRGB32) + .set_sw_format(ffmpeg::format::Pixel::ZBGR32) .set_format(ffmpeg::format::Pixel::CUDA) .build() .with_context(|| "while trying to create CUDA frame context")?; - */ + let (mut encoder, mut video_encoder_context) = + create_context_and_set_common_parameters("h264_nvenc", &size, max_framerate, bitrate) + .with_context(|| "while trying to create encoder")?; - todo!("Implement me!"); + video_encoder_context.set_format(ffmpeg::format::Pixel::CUDA); + + video_encoder_context.set_qmin(37); + video_encoder_context.set_qmax(33); + + unsafe { + // FIXME: this currently breaks the avbufferref system a bit + (*video_encoder_context.as_mut_ptr()).hw_frames_ctx = hw_frame_context.as_raw_mut(); + (*video_encoder_context.as_mut_ptr()).hw_device_ctx = + hw_frame_context.as_device_context_mut(); + } + + // set h264_nvenc options + let mut dict = ffmpeg::Dictionary::new(); + + dict.set("tune", "ull"); + dict.set("preset", "p1"); + + dict.set("profile", "main"); + + // TODO: + dict.set("rc", "vbr"); + dict.set("qp", "35"); + + dict.set("forced-idr", "1"); + + // damn you + dict.set("delay", "0"); + dict.set("zerolatency", "1"); + + let encoder = video_encoder_context + .open_as_with(encoder, dict) + .with_context(|| "While opening h264_nvenc video codec")?; + + Ok(Self::NvencHWFrame { + encoder: encoder, + hw_context: hw_frame_context, + }) } // NOTE: It's a bit pointless to have this have a mut borrow, @@ -207,46 +243,44 @@ impl H264Encoder { // } //} - pub fn create_frame(&mut self) -> ffmpeg::frame::Video { - + pub fn create_frame(&mut self) -> anyhow::Result { match self { Self::Software { encoder } | Self::NvencSWFrame { encoder } => { - return ffmpeg::frame::Video::new(encoder.format(), encoder.width(), encoder.height()); + return Ok(ffmpeg::frame::Video::new( + encoder.format(), + encoder.width(), + encoder.height(), + )); } Self::NvencHWFrame { encoder, hw_context, } => { - todo!("Implement H264Encoder::create_frame() for NvencHWFrame!"); + let mut frame = ffmpeg::frame::Video::empty(); + + unsafe { + (*frame.as_mut_ptr()).format = ffmpeg::format::Pixel::CUDA as i32; + (*frame.as_mut_ptr()).width = encoder.width() as i32; + (*frame.as_mut_ptr()).height = encoder.height() as i32; + (*frame.as_mut_ptr()).hw_frames_ctx = hw_context.as_raw_mut(); + + //ffmpeg::sys::av_frame_get_buffer(frame.as_mut_ptr(), 32); + + hw_context.get_buffer(&mut frame)?; + hw_context.get_buffer(&mut frame)?; + + (*frame.as_mut_ptr()).linesize[0] = (*frame.as_ptr()).width * 4; + + return Ok(frame); + } } } /* - unsafe { - let mut frame = ffmpeg::Frame::empty(); - (*frame.as_mut_ptr()).format = pixel_format as i32; - (*frame.as_mut_ptr()).width = width as i32; - (*frame.as_mut_ptr()).height = height as i32; - (*frame.as_mut_ptr()).hw_frames_ctx = context.as_raw_mut(); - super::check_ret(ffmpeg::sys::av_hwframe_get_buffer( - context.as_raw_mut(), - frame.as_mut_ptr(), - 0, - ))?; - super::check_ret(ffmpeg::sys::av_hwframe_get_buffer( - context.as_raw_mut(), - frame.as_mut_ptr(), - 0, - ))?; - (*frame.as_mut_ptr()).linesize[0] = (*frame.as_ptr()).width * 4; - - Ok(frame) - } - - */ + */ todo!("FIXME"); } @@ -267,7 +301,8 @@ impl H264Encoder { encoder, hw_context, } => { - todo!("Implement send_frame() for NvencHWFrame"); + 
//todo!("Implement send_frame() for NvencHWFrame"); + encoder.send_frame(frame).unwrap(); } } } diff --git a/server/src/video/hwframe.rs b/server/src/video/hwframe.rs index e58f72a..721ca5c 100644 --- a/server/src/video/hwframe.rs +++ b/server/src/video/hwframe.rs @@ -7,82 +7,103 @@ use ffmpeg::format::Pixel; use super::{check_ret, hwdevice::CudaDeviceContext}; pub struct HwFrameContext { - _cuda_device_context: CudaDeviceContext, - buffer: *mut ffmpeg::sys::AVBufferRef, + _cuda_device_context: CudaDeviceContext, + buffer: *mut ffmpeg::sys::AVBufferRef, } impl HwFrameContext { - fn new(cuda_device_context: CudaDeviceContext, buffer: *mut ffmpeg::sys::AVBufferRef) -> Self { - Self { _cuda_device_context: cuda_device_context, buffer } - } + fn new(cuda_device_context: CudaDeviceContext, buffer: *mut ffmpeg::sys::AVBufferRef) -> Self { + Self { + _cuda_device_context: cuda_device_context, + buffer, + } + } - // pub fn as_context_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext { - // unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) } - // } + // pub fn as_context_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext { + // unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) } + // } - // pub fn as_context(&self) -> &ffmpeg::sys::AVHWFramesContext { - // unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) } - // } + // pub fn as_context(&self) -> &ffmpeg::sys::AVHWFramesContext { + // unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) } + // } - pub fn as_raw_mut(&mut self) -> &mut ffmpeg::sys::AVBufferRef { - unsafe { &mut *self.buffer } - } + pub fn as_raw_mut(&mut self) -> &mut ffmpeg::sys::AVBufferRef { + unsafe { &mut *self.buffer } + } - // pub fn as_raw(&self) -> &ffmpeg::sys::AVBufferRef { - // unsafe { &*self.buffer } - // } + pub fn as_device_context_mut(&mut self) -> &mut ffmpeg::sys::AVBufferRef { + unsafe { + self._cuda_device_context.as_raw_mut() + } + } + + /// call once to allocate frame + pub fn get_buffer(&mut self, frame: &mut ffmpeg::frame::Video) -> Result<(), ffmpeg::Error> { + unsafe { + super::check_ret(ffmpeg::sys::av_hwframe_get_buffer(self.buffer, frame.as_mut_ptr(), 0))?; + } + + Ok(()) + } + + // pub fn as_raw(&self) -> &ffmpeg::sys::AVBufferRef { + // unsafe { &*self.buffer } + // } } -unsafe impl Send for HwFrameContext { } +unsafe impl Send for HwFrameContext {} pub struct HwFrameContextBuilder { - cuda_device_context: CudaDeviceContext, - buffer: *mut ffmpeg::sys::AVBufferRef, + cuda_device_context: CudaDeviceContext, + buffer: *mut ffmpeg::sys::AVBufferRef, } impl HwFrameContextBuilder { - pub fn new(mut cuda_device_context: CudaDeviceContext) -> anyhow::Result { - let buffer = unsafe { ffmpeg::sys::av_hwframe_ctx_alloc(cuda_device_context.as_raw_mut()) }; - if buffer.is_null() { - return Err(anyhow::anyhow!("could not allocate a hwframe context")); - } + pub fn new(mut cuda_device_context: CudaDeviceContext) -> anyhow::Result { + let buffer = unsafe { ffmpeg::sys::av_hwframe_ctx_alloc(cuda_device_context.as_raw_mut()) }; + if buffer.is_null() { + return Err(anyhow::anyhow!("could not allocate a hwframe context")); + } - Ok(Self { cuda_device_context, buffer }) - } + Ok(Self { + cuda_device_context, + buffer, + }) + } - pub fn build(mut self) -> Result { - check_ret(unsafe { ffmpeg::sys::av_hwframe_ctx_init(self.buffer) })?; - let result = Ok(HwFrameContext::new(self.cuda_device_context, self.buffer)); - self.buffer = null_mut(); + pub fn build(mut self) -> 
Result { + check_ret(unsafe { ffmpeg::sys::av_hwframe_ctx_init(self.buffer) })?; + let result = Ok(HwFrameContext::new(self.cuda_device_context, self.buffer)); + self.buffer = null_mut(); - result - } + result + } - pub fn set_width(mut self, width: u32) -> Self { - self.as_frame_mut().width = width as i32; - self - } + pub fn set_width(mut self, width: u32) -> Self { + self.as_frame_mut().width = width as i32; + self + } - pub fn set_height(mut self, height: u32) -> Self { - self.as_frame_mut().height = height as i32; - self - } + pub fn set_height(mut self, height: u32) -> Self { + self.as_frame_mut().height = height as i32; + self + } - pub fn set_sw_format(mut self, sw_format: Pixel) -> Self { - self.as_frame_mut().sw_format = sw_format.into(); - self - } + pub fn set_sw_format(mut self, sw_format: Pixel) -> Self { + self.as_frame_mut().sw_format = sw_format.into(); + self + } - pub fn set_format(mut self, format: Pixel) -> Self { - self.as_frame_mut().format = format.into(); - self - } + pub fn set_format(mut self, format: Pixel) -> Self { + self.as_frame_mut().format = format.into(); + self + } - pub fn as_frame_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext { - unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) } - } + pub fn as_frame_mut(&mut self) -> &mut ffmpeg::sys::AVHWFramesContext { + unsafe { &mut *((*self.buffer).data as *mut ffmpeg::sys::AVHWFramesContext) } + } - // pub fn as_frame(&self) -> &ffmpeg::sys::AVHWFramesContext { - // unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) } - // } -} \ No newline at end of file + // pub fn as_frame(&self) -> &ffmpeg::sys::AVHWFramesContext { + // unsafe { &*((*self.buffer).data as *const ffmpeg::sys::AVHWFramesContext) } + // } +} diff --git a/server/src/video/mod.rs b/server/src/video/mod.rs index 1af486d..59adcdf 100644 --- a/server/src/video/mod.rs +++ b/server/src/video/mod.rs @@ -9,6 +9,8 @@ pub mod hwframe; pub mod encoder_thread; +pub mod cuda_gl; + // from hgaiser/moonshine pub fn check_ret(error_code: i32) -> Result<(), ffmpeg::Error> { if error_code != 0 {
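
For reference, the per-frame GPU path this patch wires up reduces to the sketch below. It uses only the types the patch itself introduces (GraphicsResource from video/cuda_gl/safe.rs, H264Encoder::new_nvenc_hwframe, the CUDA frame returned by create_frame()); launch_flip_kernel is a hypothetical helper standing in for the inline flip_opengl kernel launch in encoder_thread_hwframe_main, and error handling plus channel plumbing are elided. Treat it as an outline of the data flow, not a drop-in replacement for the thread loop.

use std::sync::Arc;
use cudarc::driver::CudaDevice;
use crate::video::ffmpeg;
use crate::video::cuda_gl::safe::GraphicsResource;
use crate::video::h264_encoder::H264Encoder;

fn encode_one_frame(
    device: &Arc<CudaDevice>,
    resource: &mut GraphicsResource,   // registered against the GL FBO texture in video_resize()
    encoder: &mut H264Encoder,         // built with H264Encoder::new_nvenc_hwframe()
    frame: &mut ffmpeg::frame::Video,  // CUDA frame from encoder.create_frame()
    packet: &mut ffmpeg::Packet,
) -> anyhow::Result<()> {
    // Done once per thread in encoder_thread_hwframe_main(); shown here for completeness.
    device.bind_to_thread()?;

    // 1. Map the registered GL texture and fetch the CUarray backing mip level 0.
    let mut mapped = resource.map()?;
    let array = mapped.get_mapped_array()?;

    // 2. Copy the GL image into the AVFrame's CUDA buffer, flipping it vertically.
    //    The patch does this with the NVRTC-compiled `flip_opengl` kernel; the
    //    commented-out cuMemcpy2DAsync_v2 block is the non-flipping alternative.
    //    `launch_flip_kernel` is a hypothetical wrapper around that launch.
    launch_flip_kernel(device, array, frame)?;

    // 3. Unmap before handing the frame to NVENC.
    mapped.unmap()?;

    // 4. Encode straight from device memory.
    encoder.send_frame(&*frame);
    encoder.receive_packet(packet)?;
    Ok(())
}

The point of the change is step 2: with the texture shared through cuGraphicsGLRegisterImage, the image never leaves the GPU between the core's FBO and the NVENC input buffer, replacing the old video_update_gl path that read pixels back with glReadPixels and re-uploaded them through the software-frame encoder.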