diff --git a/.gitignore b/.gitignore
index 8c1430b..a3fdb33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@
 client/.parcel-cache
 client/dist
 client/node_modules
-/server/target
\ No newline at end of file
+/server/target
+/server/testdata
\ No newline at end of file
diff --git a/server/src/main.rs b/server/src/main.rs
index ee3bc01..f10899d 100644
--- a/server/src/main.rs
+++ b/server/src/main.rs
@@ -22,33 +22,12 @@ use std::{
 	time::Duration,
 };
 
-use rand::distributions::DistString;
 use std::net::SocketAddr;
 use tokio::sync::{
-	broadcast,
 	mpsc::{self, error::TryRecvError},
 	Mutex as TokioMutex,
};
 
-use axum::{
-	extract::{
-		connect_info::ConnectInfo,
-		ws::{self, Message, WebSocket, WebSocketUpgrade},
-		State,
-	},
-	response::IntoResponse,
-	routing::get,
-	Router,
-};
-
-use futures::{sink::SinkExt, stream::StreamExt};
-
-#[derive(Clone)]
-enum WsMessage {
-	VideoPacket { packet: ffmpeg::Packet },
-	Json(String),
-}
-
 struct AppState {
 	encoder_tx: Arc>>,
 	inputs: Arc>>,
@@ -184,23 +163,6 @@ where
 	}
 }
 
-const FLIP_SRC: &str = "
-extern \"C\" __global__ void flip_opengl(
-	const unsigned* pSrc,
-	unsigned* pDest,
-	unsigned width,
-	unsigned height
-) {
-	const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
-	const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
-
-	if (x < width && y < height) {
-		// let reversed_y = (size.height - 1) - y;
-		unsigned reversed_y = (height - 1) - y;
-		pDest[y * width + x] = pSrc[reversed_y * width + x];
-	}
-}";
-
 #[tokio::main(flavor = "multi_thread", worker_threads = 2)]
 async fn main() -> anyhow::Result<()> {
 	// Setup a tracing subscriber
@@ -213,13 +175,6 @@ async fn main() -> anyhow::Result<()> {
 	// H.264 encoder related
 	let device = CudaDevice::new(0)?;
 
-	let ptx = cudarc::nvrtc::compile_ptx(&FLIP_SRC).with_context(|| "compiling support kernel")?;
-
-	println!("compiled kernel");
-
-	// pop it in
-	device.load_ptx(ptx, "module", &["flip_opengl"])?;
-
 	let egl_ctx = Arc::new(Mutex::new(DeviceContext::new(0)));
 	let resource = Arc::new(Mutex::new(GraphicsResource::new(&device)));
 
@@ -247,7 +202,7 @@ async fn main() -> anyhow::Result<()> {
 		"cores/swanstation_libretro.so".into(),
 	));
 	let _ = retro_input_event_tx.blocking_send(retro_thread::RetroInEvent::LoadGame(
-		"roms/nmv1_us.cue".into(),
+		"roms/merged/nmv2/jagb/nmv2jagb.cue".into(),
 	));
 
 	// start the libretro thread looping now that we're alive
@@ -257,63 +212,6 @@
 		match retro_event_rx.blocking_recv() {
 			Some(msg) => match msg {
 				RetroEvent::Frame => {
-					/*
-					let mut same = true;
-
-					{
-						let mut frame_locked = frame.lock().expect(
-							"Couldn't lock frame on our end. Did the encoder thread panic?",
-						);
-
-						let mut_frame = frame_locked.as_mut().expect("it's None? why?");
-
-						let width = mut_frame.width();
-						let height = mut_frame.height();
-
-						let mut surf = surface_clone.lock().expect(
-							"locking the VNC surface to paint it to the ffmpeg frame failed",
-						);
-
-
-						let surf_buf = surf.get_buffer();
-
-						let buf_ptr =
-							unsafe { (*(*mut_frame.as_mut_ptr()).buf[0]).data as *mut u32 };
-
-						for y in 0..height {
-							let line_stride = (y * width) as usize;
-							// Make a slice for the line
-							// SAFETY: The allocation is guaranteed to be large enough
-							// for this to work from y = 0..height
-							let dest_line_slice = unsafe {
-								let dest_line_ptr = buf_ptr.add(line_stride);
-								std::slice::from_raw_parts_mut(
-									dest_line_ptr,
-									width as usize,
-								)
-							};
-
-							// If any line differs then the frame isn't the same and should be encoded.
-							if &surf_buf[line_stride..line_stride + width as usize]
-								!= dest_line_slice
-							{
-								same = false;
-							}
-
-							dest_line_slice.copy_from_slice(
-								&surf_buf[line_stride..line_stride + width as usize],
-							);
-						}
-					}
-
-					if !same {
-						let _ = state_clone
-							.encoder_tx
-							.blocking_lock()
-							.blocking_send(encoder_thread::EncodeThreadInput::SendFrame);
-					}
-					*/
-
 					let _ = state_clone
 						.encoder_tx
 						.blocking_lock()
@@ -321,23 +219,9 @@ async fn main() -> anyhow::Result<()> {
 				}
 
 				RetroEvent::Resize { size } => {
-					// make a new frame for the encoder
-
 					let _ = state_clone.encoder_tx.blocking_lock().blocking_send(
 						encoder_thread::EncodeThreadInput::Init { size: size.clone() },
 					);
-
-					/*
-					{
-						let mut lk_frame = frame_clone.lock().expect("Couldn't lock frame");
-
-						*lk_frame = Some(ffmpeg::frame::Video::new(
-							ffmpeg::format::Pixel::BGRA,
-							size.clone().width,
-							size.clone().height,
-						));
-					}
-					*/
 				}
 
 				RetroEvent::WantInputs { tx } => {
diff --git a/server/src/video/cuda_gl/safe.rs b/server/src/video/cuda_gl/safe.rs
index 11f2b0c..5a1fa60 100644
--- a/server/src/video/cuda_gl/safe.rs
+++ b/server/src/video/cuda_gl/safe.rs
@@ -48,6 +48,24 @@ impl MappedGraphicsResource {
 
 		Ok(array)
 	}
+
+	pub fn get_device_pointer(&mut self) -> Result {
+		assert!(
+			!self.resource.is_null(),
+			"do not call GraphicsResource::get_mapped_array if no resource is actually registered"
+		);
+
+		let mut array: cuda_sys::CUdeviceptr = 0;
+		let mut size: usize = 0;
+
+		unsafe {
+			cuda_sys::lib()
+				.cuGraphicsResourceGetMappedPointer_v2(&mut array, &mut size, self.resource)
+				.result()?;
+		}
+
+		Ok(array)
+	}
 }
 
 impl Drop for MappedGraphicsResource {
diff --git a/server/src/video/encoder_thread.rs b/server/src/video/encoder_thread.rs
index d75ec75..dd5f7a4 100644
--- a/server/src/video/encoder_thread.rs
+++ b/server/src/video/encoder_thread.rs
@@ -1,7 +1,8 @@
-use cudarc::driver::{
+use anyhow::Context;
+use cudarc::{driver::{
 	sys::{CUdeviceptr, CUmemorytype},
-	CudaDevice, LaunchAsync,
-};
+	CudaDevice, CudaSlice, DevicePtr, LaunchAsync,
+}, nvrtc::CompileOptions};
 use letsplay_gpu::egl_helpers::DeviceContext;
 use std::{
 	sync::{Arc, Mutex},
@@ -233,6 +234,23 @@ pub fn encoder_thread_spawn_swframe(
 	(out_rx, in_tx)
 }
 
+/// Source for the kernel used to flip OpenGL framebuffers right-side up.
+const OPENGL_FLIP_KERNEL_SRC: &str = "
+extern \"C\" __global__ void flip_opengl(
+	const unsigned* pSrc,
+	unsigned* pDest,
+	int width,
+	int height
+) {
+	const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
+	const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if (x < width && y < height) {
+		unsigned reversed_y = (height - 1) - y;
+		((unsigned*)pDest)[y * width + x] = ((unsigned*)pSrc)[reversed_y * width + x];
+	}
+}";
+
 fn encoder_thread_hwframe_main(
 	mut rx: mpsc::Receiver,
 	tx: mpsc::Sender,
@@ -249,7 +267,18 @@ fn encoder_thread_hwframe_main(
 	// :)
 	cuda_device.bind_to_thread()?;
 
-	/*
+	// Compile the support kernel
+	let ptx = cudarc::nvrtc::compile_ptx_with_opts(
+		&OPENGL_FLIP_KERNEL_SRC,
+		CompileOptions {
+			//options: vec!["--gpu-architecture=compute_50".into()],
+			..Default::default()
+		},
+	)
+	.with_context(|| "compiling support kernel")?;
+
+	// pop it in
+	cuda_device.load_ptx(ptx, "module", &["flip_opengl"])?;
 	let mut memcpy = cudarc::driver::sys::CUDA_MEMCPY2D_st::default();
 
 	// setup basic src stuff
@@ -261,7 +290,9 @@ fn encoder_thread_hwframe_main(
 	memcpy.dstXInBytes = 0;
 	memcpy.dstY = 0;
 	memcpy.dstMemoryType = CUmemorytype::CU_MEMORYTYPE_DEVICE;
-	*/
+
+	// Temporary buffer used for opengl flip on the GPU
+	let mut temp_buffer: CudaSlice<u32> = cuda_device.alloc_zeros::<u32>(48).expect("over");
 
 	loop {
 		match rx.try_recv() {
@@ -273,6 +304,10 @@ fn encoder_thread_hwframe_main(
 						force_keyframe = false;
 					}
 
+					temp_buffer = cuda_device
+						.alloc_zeros::<u32>((size.width * size.height) as usize)
+						.expect("oh youre fucked anyways");
+
 					encoder
 						.init(cuda_device, size)
 						.expect("encoder init failed");
@@ -299,53 +334,17 @@ fn encoder_thread_hwframe_main(
 						.get_mapped_array()
 						.expect("well its all over anyways");
 
+					let frame = encoder.frame();
+
+					// setup the cuMemcpy2D operation to copy to the temporary buffer
+					// (we should probably abstract source and provide a way to elide this,
+					// and instead feed ffmpeg directly. for now it's *just* used with gl so /shrug)
 					{
-						let frame = encoder.frame();
-						let width = frame.width();
-						let height = frame.height();
-
-						let launch_config = cudarc::driver::LaunchConfig {
-							grid_dim: (width / 16 + 1, height / 2 + 1, 0),
-							block_dim: (16, 2, 0),
-							shared_mem_bytes: 0,
-						};
-
-						let flip_opengl = cuda_device
-							.get_func("module", "flip_opengl")
-							.expect("dumb fucker");
-
-						unsafe {
-							let frame_ptr = frame.as_mut_ptr();
-
-							let mut params = [
-								array as *mut std::ffi::c_void,
-								(*frame_ptr).data[0] as CUdeviceptr as *mut std::ffi::c_void,
-								width as *const u32 as *mut std::ffi::c_void,
-								height as *const u32 as *mut std::ffi::c_void,
-							];
-
-							flip_opengl
-								.launch(launch_config, &mut params[..])
-								.expect("oh its done");
-
-
-							cudarc::driver::sys::lib()
-								.cuStreamSynchronize(std::ptr::null_mut())
-								.result()
-								.expect("you banned");
-						}
-					}
-
-					/*
-					// setup the cuMemcpy2D
-					{
-						let frame = encoder.frame();
-
 						memcpy.srcArray = array;
 
 						unsafe {
							let frame_ptr = frame.as_mut_ptr();
-							memcpy.dstDevice = (*frame_ptr).data[0] as CUdeviceptr;
+							memcpy.dstDevice = temp_buffer.device_ptr().clone();
 							memcpy.dstPitch = (*frame_ptr).linesize[0] as usize;
 							memcpy.WidthInBytes = ((*frame_ptr).width * 4) as usize;
 							memcpy.Height = (*frame_ptr).height as usize;
@@ -357,13 +356,47 @@ fn encoder_thread_hwframe_main(
 								.cuMemcpy2DAsync_v2(&memcpy, std::ptr::null_mut())
 								.result()
 								.expect("cuMemcpy2D fail epic");
+						}
 
-							cudarc::driver::sys::lib()
-								.cuStreamSynchronize(std::ptr::null_mut())
-								.result()
-								.expect("you banned");
+					// launch kernel to flip the opengl framebuffer right-side up
+					{
+						let width = frame.width();
+						let height = frame.height();
 
-					} */
+						let launch_config = cudarc::driver::LaunchConfig {
+							grid_dim: (width / 16 + 1, height / 2 + 1, 1),
+							block_dim: (16, 2, 1),
+							shared_mem_bytes: 0,
+						};
+
+						let flip_opengl = cuda_device
+							.get_func("module", "flip_opengl")
+							.expect("dumb fucker");
+
+						unsafe {
+							let frame_ptr = frame.as_mut_ptr();
+
+							let mut slice = cuda_device.upgrade_device_ptr::<u8>(
+								(*frame_ptr).data[0] as CUdeviceptr,
+								(width * height) as usize * 4usize,
+							);
+
+							flip_opengl
+								.launch(
+									launch_config,
+									(&mut temp_buffer, &mut slice, width, height),
+								)
+								.expect("I hate you");
+
+							cudarc::driver::sys::lib()
+								.cuStreamSynchronize(std::ptr::null_mut())
+								.result()
+								.expect("you banned");
+
+							// leak so it doesnt free the memory like a dumbass
+							let _ = slice.leak();
+						}
+					}
 
 					mapped.unmap().expect("fuck you asshole");
 					gl_ctx.release();
diff --git a/server/src/video/h264_encoder.rs b/server/src/video/h264_encoder.rs
index 313fbfb..583ddfd 100644
--- a/server/src/video/h264_encoder.rs
+++ b/server/src/video/h264_encoder.rs
@@ -163,6 +163,10 @@ impl H264Encoder {
 		Ok(Self::NvencSWFrame { encoder: encoder })
 	}
 
+	/// Creates a new hardware (NVIDIA NVENC) encoder, which encodes
+	/// frames from GPU memory, via CUDA.
+	/// Unlike the software-frame variant, FFmpeg does not handle uploading frames to the GPU here.
+	/// You are expected to handle uploading or otherwise working with a frame on the GPU.
 	pub fn new_nvenc_hwframe(
 		cuda_device: &CudaDevice,
 		size: Size,
@@ -188,8 +192,8 @@
 
 		video_encoder_context.set_format(ffmpeg::format::Pixel::CUDA);
 
-		video_encoder_context.set_qmin(37);
-		video_encoder_context.set_qmax(33);
+		video_encoder_context.set_qmin(35);
+		video_encoder_context.set_qmax(30);
 
 		unsafe {
 			// FIXME: this currently breaks the avbufferref system a bit
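For reference, the sketch below is not part of the patch. It is a minimal, standalone exercise of the `flip_opengl` kernel that the patch moves into `encoder_thread.rs`, driven through the same cudarc entry points the patch uses (`compile_ptx`, `load_ptx`, `get_func`, `LaunchConfig`, `launch`); the `main` function, the 4x3 test pattern, and the host-copy helpers (`htod_copy`, `dtoh_sync_copy`, `alloc_zeros`) are my additions for illustration, assuming cudarc's standard device API, and are not taken from the repository.

```rust
// Standalone sketch (not the project's code): flip a tiny "image" with the
// same kernel source and launch geometry style used in the patch.
use cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig};

const KERNEL_SRC: &str = r#"
extern "C" __global__ void flip_opengl(
    const unsigned* pSrc, unsigned* pDest, int width, int height
) {
    const unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        unsigned reversed_y = (height - 1) - y;
        pDest[y * width + x] = pSrc[reversed_y * width + x];
    }
}"#;

fn main() -> anyhow::Result<()> {
    let device = CudaDevice::new(0)?;

    // Compile and load the kernel, mirroring the compile_ptx/load_ptx calls in the patch.
    let ptx = cudarc::nvrtc::compile_ptx(KERNEL_SRC)?;
    device.load_ptx(ptx, "module", &["flip_opengl"])?;

    // 4x3 test image: every pixel carries its row index, so the flip is easy to verify.
    let (width, height) = (4usize, 3usize);
    let host: Vec<u32> = (0..height)
        .flat_map(|y| std::iter::repeat(y as u32).take(width))
        .collect();

    let src = device.htod_copy(host)?;
    let mut dst = device.alloc_zeros::<u32>(width * height)?;

    // One thread per pixel, rounded up to the 16x2 block shape the patch uses.
    let cfg = LaunchConfig {
        grid_dim: ((width as u32 + 15) / 16, (height as u32 + 1) / 2, 1),
        block_dim: (16, 2, 1),
        shared_mem_bytes: 0,
    };

    let flip = device
        .get_func("module", "flip_opengl")
        .expect("kernel should have been loaded above");
    unsafe {
        flip.launch(cfg, (&src, &mut dst, width as i32, height as i32))?;
    }

    // dtoh_sync_copy synchronizes before copying the result back.
    let flipped = device.dtoh_sync_copy(&dst)?;
    assert_eq!(flipped[0], (height - 1) as u32); // row 0 is now the old bottom row
    Ok(())
}
```

In the patch itself the source is not a plain device buffer: the mapped OpenGL resource comes back as a CUDA array (`get_mapped_array`), which a kernel cannot dereference as a raw pointer, so the patch first copies it into the linear `temp_buffer` with `cuMemcpy2DAsync_v2` and only then launches `flip_opengl` from that buffer into the encoder frame. The `get_device_pointer` helper added to `cuda_gl/safe.rs` would be another way to obtain a linear pointer, but it does not appear to be called in the hunks shown.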