yuzu/src/core/hw/y2r.cpp

// Copyright 2015 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include <algorithm>
#include <array>
#include <cstddef>
#include <memory>

#include "common/assert.h"
#include "common/color.h"
#include "common/common_types.h"
#include "common/math_util.h"
#include "common/vector_math.h"

#include "core/hle/service/y2r_u.h"
#include "core/memory.h"

namespace HW {
namespace Y2R {

using namespace Y2R_U;

static const size_t MAX_TILES = 1024 / 8;
static const size_t TILE_SIZE = 8 * 8;
using ImageTile = std::array<u32, TILE_SIZE>;

/// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
static void ConvertYUVToRGB(InputFormat input_format,
        const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[],
        unsigned int width, unsigned int height, const CoefficientSet& coefficients) {

    for (unsigned int y = 0; y < height; ++y) {
        for (unsigned int x = 0; x < width; ++x) {
            s32 Y, U, V;
            switch (input_format) {
            case InputFormat::YUV422_Indiv8:
            case InputFormat::YUV422_Indiv16:
                Y = input_Y[y * width + x];
                U = input_U[(y * width + x) / 2];
                V = input_V[(y * width + x) / 2];
                break;
            case InputFormat::YUV420_Indiv8:
            case InputFormat::YUV420_Indiv16:
                Y = input_Y[y * width + x];
                U = input_U[((y / 2) * width + x) / 2];
                V = input_V[((y / 2) * width + x) / 2];
                break;
            case InputFormat::YUYV422_Interleaved:
                Y = input_Y[(y * width + x) * 2];
                U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
                V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
                break;
            }

            // This conversion process is bit-exact with hardware, as far as could be tested.
            auto& c = coefficients;
            s32 cY = c[0]*Y;

            s32 r = cY          + c[1]*V;
            s32 g = cY - c[3]*U - c[2]*V;
            s32 b = cY + c[4]*U;

            const s32 rounding_offset = 0x18;
            r = (r >> 3) + c[5] + rounding_offset;
            g = (g >> 3) + c[6] + rounding_offset;
            b = (b >> 3) + c[7] + rounding_offset;

            unsigned int tile = x / 8;
            unsigned int tile_x = x % 8;
            u32* out = &output[tile][y * 8 + tile_x];

            using MathUtil::Clamp;
            *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) |
                   ((u32)Clamp(g >> 5, 0, 0xFF) << 16) |
                   ((u32)Clamp(b >> 5, 0, 0xFF) << 8);
        }
    }
}

/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit.
template <size_t N>
static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {
    const u8* input = Memory::GetPointer(buf.address);

    size_t output_unit = buf.transfer_unit / N;
    ASSERT(amount_of_data % output_unit == 0);

    while (amount_of_data > 0) {
        for (size_t i = 0; i < output_unit; ++i) {
            output[i] = input[i * N];
        }

        output += output_unit;
        input += buf.transfer_unit + buf.gap;

        buf.address += buf.transfer_unit + buf.gap;
        buf.image_size -= buf.transfer_unit;
        amount_of_data -= output_unit;
    }
}

/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer.
static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,
        OutputFormat output_format, u8 alpha) {

    u8* output = Memory::GetPointer(buf.address);

    while (amount_of_data > 0) {
        u8* unit_end = output + buf.transfer_unit;
        while (output < unit_end) {
            u32 color = *input++;
            Math::Vec4<u8> col_vec{
                (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >>  8) & 0xFF, alpha,
            };

            switch (output_format) {
            case OutputFormat::RGBA8:
                Color::EncodeRGBA8(col_vec, output);
                output += 4;
                break;
            case OutputFormat::RGB8:
                Color::EncodeRGB8(col_vec, output);
                output += 3;
                break;
            case OutputFormat::RGB5A1:
                Color::EncodeRGB5A1(col_vec, output);
                output += 2;
                break;
            case OutputFormat::RGB565:
                Color::EncodeRGB565(col_vec, output);
                output += 2;
                break;
            }

            amount_of_data -= 1;
        }

        output += buf.gap;
        buf.address += buf.transfer_unit + buf.gap;
        buf.image_size -= buf.transfer_unit;
    }
}

static const u8 linear_lut[64] = {
     0,  1,  2,  3,  4,  5,  6,  7,
     8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55,
    56, 57, 58, 59, 60, 61, 62, 63,
};

static const u8 morton_lut[64] = {
     0,  1,  4,  5, 16, 17, 20, 21,
     2,  3,  6,  7, 18, 19, 22, 23,
     8,  9, 12, 13, 24, 25, 28, 29,
    10, 11, 14, 15, 26, 27, 30, 31,
    32, 33, 36, 37, 48, 49, 52, 53,
    34, 35, 38, 39, 50, 51, 54, 55,
    40, 41, 44, 45, 56, 57, 60, 61,
    42, 43, 46, 47, 58, 59, 62, 63,
};

static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
    for (int i = 0; i < height * 8; ++i) {
        output[out_map[i]] = input[i];
    }
}

static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
    int out_i = 0;
    for (int x = 0; x < 8; ++x) {
        for (int y = height - 1; y >= 0; --y) {
            output[out_map[out_i++]] = input[y * 8 + x];
        }
    }
}

static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
    int out_i = 0;
    for (int i = height * 8 - 1; i >= 0; --i) {
        output[out_map[out_i++]] = input[i];
    }
}

static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {
    int out_i = 0;
    for (int x = 8-1; x >= 0; --x) {
        for (int y = 0; y < height; ++y) {
            output[out_map[out_i++]] = input[y * 8 + x];
        }
    }
}

static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < 8; ++x) {
            output[y * line_stride + x] = tile[y * 8 + x];
        }
    }
}

/**
 * Performs a Y2R colorspace conversion.
 *
 * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
 * commonly used for video playback or to display camera input to the screen.
 *
 * The conversion process is quite configurable, and can be divided in distinct steps. From
 * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
 * internally and converts it in one go before writing to the output and loading the next strip.
 *
 * The steps taken to convert one strip of image data are:
 *
 * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
 *   presumably stored in one or more internal buffers. This process can be done in several separate
 *   transfers, as long as they don't exceed the size of the internal image buffer. This allows
 *   flexibility in input strides.
 * - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat`
 *   enum.
 * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
 *   using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
 * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
 *   independently, this notably rotates each *strip*, not the entire image. This means that for 90
 *   or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
 *   non-zero rotation the strips will have to be re-arranged so that the parts of the image will
 *   not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
 *   or 270 degree rotations in images with non-even height don't seem to work properly.
 * - The data is converted to the output RGB format. See the `OutputFormat` enum.
 * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
 *   the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
 *   image must have a height divisible by 8. The image width must always be divisible by 8.
 * - The final data is then CDMAed out to main memory and the next image strip is processed. This
 *   offers the same flexibility as the input stage.
 *
 * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
 * intermediate formats are used and where possible tables or parameters are used instead of
 * diverging code paths to keep the amount of branches in check. Some steps are also merged to
 * increase efficiency.
 *
 * Output for all valid settings combinations matches hardware, however output in some edge-cases
 * differs:
 *
 * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
 *   strip, especially when combined with rotation.
 * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
 *   produces misaligned output on the last strip. This implmentation produces output with the
 *   correct "expected" alignment.
 *
 * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
 * so they are believed to be invalid configurations anyway.
 */
void PerformConversion(ConversionConfiguration& cvt) {
    ASSERT(cvt.input_line_width % 8 == 0);
    ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
    // Tiles per row
    size_t num_tiles = cvt.input_line_width / 8;
    ASSERT(num_tiles < MAX_TILES);

    // Buffer used as a CDMA source/target.
    std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
    // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
    std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
    ImageTile tmp_tile;

    // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
    // requiring two different code paths.
    const u8* tile_remap;
    switch (cvt.block_alignment) {
    case BlockAlignment::Linear:
        tile_remap = linear_lut; break;
    case BlockAlignment::Block8x8:
        tile_remap = morton_lut; break;
    }

    for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
        unsigned int row_height = std::min(cvt.input_lines - y, 8u);

        // Total size in pixels of incoming data required for this strip.
        const size_t row_data_size = row_height * cvt.input_line_width;

        u8* input_Y = data_buffer.get();
        u8* input_U = input_Y + 8 * cvt.input_line_width;
        u8* input_V = input_U + 8 * cvt.input_line_width / 2;

        switch (cvt.input_format) {
        case InputFormat::YUV422_Indiv8:
            ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
            ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);
            ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);
            break;
        case InputFormat::YUV420_Indiv8:
            ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);
            ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);
            ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);
            break;
        case InputFormat::YUV422_Indiv16:
            ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
            ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);
            ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);
            break;
        case InputFormat::YUV420_Indiv16:
            ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);
            ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);
            ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);
            break;
        case InputFormat::YUYV422_Interleaved:
            input_U = nullptr;
            input_V = nullptr;
            ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);
            break;
        }

        // Note(yuriks): If additional optimization is required, input_format can be moved to a
        // template parameter, so that its dispatch can be moved to outside the inner loop.
        ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
                cvt.input_line_width, row_height, cvt.coefficients);

        u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());

        for (int i = 0; i < num_tiles; ++i) {
            int image_strip_width, output_stride;

            switch (cvt.rotation) {
            case Rotation::None:
                RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
                image_strip_width = cvt.input_line_width;
                output_stride = 8;
                break;
            case Rotation::Clockwise_90:
                RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
                image_strip_width = 8;
                output_stride = 8 * row_height;
                break;
            case Rotation::Clockwise_180:
                // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
                // since the rotates are done individually on each tile.
                RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
                image_strip_width = cvt.input_line_width;
                output_stride = 8;
                break;
            case Rotation::Clockwise_270:
                RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
                image_strip_width = 8;
                output_stride = 8 * row_height;
                break;
            }

            switch (cvt.block_alignment) {
            case BlockAlignment::Linear:
                WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
                output_buffer += output_stride;
                break;
            case BlockAlignment::Block8x8:
                WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
                output_buffer += TILE_SIZE;
                break;
            }
        }

        // Note(yuriks): If additional optimization is required, output_format can be moved to a
        // template parameter, so that its dispatch can be moved to outside the inner loop.
        SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha);
    }
}

}
}
Y2R: Rework conversion process, enabling support for all formats 2015-06-07 21:24:03 -04:00			`// Copyright 2015 Citra Emulator Project`
			`// Licensed under GPLv2 or any later version`
			`// Refer to the license.txt file included.`

Core: Cleanup hw includes. 2015-06-27 12:56:17 -04:00			`#include <algorithm>`
Y2R: Rework conversion process, enabling support for all formats 2015-06-07 21:24:03 -04:00			`#include <array>`
Core: Cleanup hw includes. 2015-06-27 12:56:17 -04:00			`#include <cstddef>`
			`#include <memory>`
Y2R: Rework conversion process, enabling support for all formats 2015-06-07 21:24:03 -04:00
			`#include "common/assert.h"`
			`#include "common/color.h"`
			`#include "common/common_types.h"`
			`#include "common/math_util.h"`
			`#include "common/vector_math.h"`

			`#include "core/hle/service/y2r_u.h"`
			`#include "core/memory.h"`

			`namespace HW {`
			`namespace Y2R {`

			`using namespace Y2R_U;`

			`static const size_t MAX_TILES = 1024 / 8;`
			`static const size_t TILE_SIZE = 8 * 8;`
			`using ImageTile = std::array<u32, TILE_SIZE>;`

			`/// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.`
			`static void ConvertYUVToRGB(InputFormat input_format,`
			`const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[],`
			`unsigned int width, unsigned int height, const CoefficientSet& coefficients) {`

			`for (unsigned int y = 0; y < height; ++y) {`
			`for (unsigned int x = 0; x < width; ++x) {`
			`s32 Y, U, V;`
			`switch (input_format) {`
			`case InputFormat::YUV422_Indiv8:`
			`case InputFormat::YUV422_Indiv16:`
			`Y = input_Y[y * width + x];`
			`U = input_U[(y * width + x) / 2];`
			`V = input_V[(y * width + x) / 2];`
			`break;`
			`case InputFormat::YUV420_Indiv8:`
			`case InputFormat::YUV420_Indiv16:`
			`Y = input_Y[y * width + x];`
			`U = input_U[((y / 2) * width + x) / 2];`
			`V = input_V[((y / 2) * width + x) / 2];`
			`break;`
			`case InputFormat::YUYV422_Interleaved:`
			`Y = input_Y[(y * width + x) * 2];`
			`U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];`
			`V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];`
			`break;`
			`}`

			`// This conversion process is bit-exact with hardware, as far as could be tested.`
			`auto& c = coefficients;`
			`s32 cY = c[0]*Y;`

			`s32 r = cY + c[1]*V;`
			`s32 g = cY - c[3]U - c[2]V;`
			`s32 b = cY + c[4]*U;`

			`const s32 rounding_offset = 0x18;`
			`r = (r >> 3) + c[5] + rounding_offset;`
			`g = (g >> 3) + c[6] + rounding_offset;`
			`b = (b >> 3) + c[7] + rounding_offset;`

			`unsigned int tile = x / 8;`
			`unsigned int tile_x = x % 8;`
			`u32* out = &output[tile][y * 8 + tile_x];`

			`using MathUtil::Clamp;`
			`*out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) \|`
			`((u32)Clamp(g >> 5, 0, 0xFF) << 16) \|`
			`((u32)Clamp(b >> 5, 0, 0xFF) << 8);`
			`}`
			`}`
			`}`

			`/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit.`
			`template <size_t N>`
			`static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) {`
			`const u8* input = Memory::GetPointer(buf.address);`

			`size_t output_unit = buf.transfer_unit / N;`
			`ASSERT(amount_of_data % output_unit == 0);`

			`while (amount_of_data > 0) {`
			`for (size_t i = 0; i < output_unit; ++i) {`
			`output[i] = input[i * N];`
			`}`

			`output += output_unit;`
			`input += buf.transfer_unit + buf.gap;`

			`buf.address += buf.transfer_unit + buf.gap;`
			`buf.image_size -= buf.transfer_unit;`
			`amount_of_data -= output_unit;`
			`}`
			`}`

			`/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer.`
			`static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data,`
			`OutputFormat output_format, u8 alpha) {`

			`u8* output = Memory::GetPointer(buf.address);`

			`while (amount_of_data > 0) {`
			`u8* unit_end = output + buf.transfer_unit;`
			`while (output < unit_end) {`
			`u32 color = *input++;`
			`Math::Vec4<u8> col_vec{`
			`(color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >> 8) & 0xFF, alpha,`
			`};`

			`switch (output_format) {`
			`case OutputFormat::RGBA8:`
			`Color::EncodeRGBA8(col_vec, output);`
			`output += 4;`
			`break;`
			`case OutputFormat::RGB8:`
			`Color::EncodeRGB8(col_vec, output);`
			`output += 3;`
			`break;`
			`case OutputFormat::RGB5A1:`
			`Color::EncodeRGB5A1(col_vec, output);`
			`output += 2;`
			`break;`
			`case OutputFormat::RGB565:`
			`Color::EncodeRGB565(col_vec, output);`
			`output += 2;`
			`break;`
			`}`

			`amount_of_data -= 1;`
			`}`

			`output += buf.gap;`
			`buf.address += buf.transfer_unit + buf.gap;`
			`buf.image_size -= buf.transfer_unit;`
			`}`
			`}`

			`static const u8 linear_lut[64] = {`
			`0, 1, 2, 3, 4, 5, 6, 7,`
			`8, 9, 10, 11, 12, 13, 14, 15,`
			`16, 17, 18, 19, 20, 21, 22, 23,`
			`24, 25, 26, 27, 28, 29, 30, 31,`
			`32, 33, 34, 35, 36, 37, 38, 39,`
			`40, 41, 42, 43, 44, 45, 46, 47,`
			`48, 49, 50, 51, 52, 53, 54, 55,`
			`56, 57, 58, 59, 60, 61, 62, 63,`
			`};`

			`static const u8 morton_lut[64] = {`
			`0, 1, 4, 5, 16, 17, 20, 21,`
			`2, 3, 6, 7, 18, 19, 22, 23,`
			`8, 9, 12, 13, 24, 25, 28, 29,`
			`10, 11, 14, 15, 26, 27, 30, 31,`
			`32, 33, 36, 37, 48, 49, 52, 53,`
			`34, 35, 38, 39, 50, 51, 54, 55,`
			`40, 41, 44, 45, 56, 57, 60, 61,`
			`42, 43, 46, 47, 58, 59, 62, 63,`
			`};`

			`static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {`
			`for (int i = 0; i < height * 8; ++i) {`
			`output[out_map[i]] = input[i];`
			`}`
			`}`

			`static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {`
			`int out_i = 0;`
			`for (int x = 0; x < 8; ++x) {`
			`for (int y = height - 1; y >= 0; --y) {`
			`output[out_map[out_i++]] = input[y * 8 + x];`
			`}`
			`}`
			`}`

			`static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {`
			`int out_i = 0;`
			`for (int i = height * 8 - 1; i >= 0; --i) {`
			`output[out_map[out_i++]] = input[i];`
			`}`
			`}`

			`static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) {`
			`int out_i = 0;`
			`for (int x = 8-1; x >= 0; --x) {`
			`for (int y = 0; y < height; ++y) {`
			`output[out_map[out_i++]] = input[y * 8 + x];`
			`}`
			`}`
			`}`

			`static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {`
			`for (int y = 0; y < height; ++y) {`
			`for (int x = 0; x < 8; ++x) {`
			`output[y * line_stride + x] = tile[y * 8 + x];`
			`}`
			`}`
			`}`

			`/**`
			`* Performs a Y2R colorspace conversion.`
			`*`
			`* The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most`
			`* commonly used for video playback or to display camera input to the screen.`
			`*`
			`* The conversion process is quite configurable, and can be divided in distinct steps. From`
			`* observation, it appears that the hardware buffers a single 8-pixel tall strip of image data`
			`* internally and converts it in one go before writing to the output and loading the next strip.`
			`*`
			`* The steps taken to convert one strip of image data are:`
			`*`
			`* - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is`
			`* presumably stored in one or more internal buffers. This process can be done in several separate`
			`* transfers, as long as they don't exceed the size of the internal image buffer. This allows`
			`* flexibility in input strides.`
			* - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat`
			`* enum.`
			`* - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured`
			* using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
			`* - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed`
			`* independently, this notably rotates each strip, not the entire image. This means that for 90`
			`* or 270 degree rotations, the output will be in terms of several 8 x height images, and for any`
			`* non-zero rotation the strips will have to be re-arranged so that the parts of the image will`
			`* not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90`
			`* or 270 degree rotations in images with non-even height don't seem to work properly.`
			* - The data is converted to the output RGB format. See the `OutputFormat` enum.
			`* - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by`
			* the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
			`* image must have a height divisible by 8. The image width must always be divisible by 8.`
			`* - The final data is then CDMAed out to main memory and the next image strip is processed. This`
			`* offers the same flexibility as the input stage.`
			`*`
			`* In this implementation, to avoid the combinatorial explosion of parameter combinations, common`
			`* intermediate formats are used and where possible tables or parameters are used instead of`
			`* diverging code paths to keep the amount of branches in check. Some steps are also merged to`
			`* increase efficiency.`
			`*`
			`* Output for all valid settings combinations matches hardware, however output in some edge-cases`
			`* differs:`
			`*`
			* - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
			`* strip, especially when combined with rotation.`
			* - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
			`* produces misaligned output on the last strip. This implmentation produces output with the`
			`* correct "expected" alignment.`
			`*`
			`* Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,`
			`* so they are believed to be invalid configurations anyway.`
			`*/`
			`void PerformConversion(ConversionConfiguration& cvt) {`
			`ASSERT(cvt.input_line_width % 8 == 0);`
			`ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 \|\| cvt.input_lines % 8 == 0);`
			`// Tiles per row`
			`size_t num_tiles = cvt.input_line_width / 8;`
			`ASSERT(num_tiles < MAX_TILES);`

			`// Buffer used as a CDMA source/target.`
			`std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);`
			`// Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.`
			`std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);`
			`ImageTile tmp_tile;`

			`// LUT used to remap writes to a tile. Used to allow linear or swizzled output without`
			`// requiring two different code paths.`
			`const u8* tile_remap;`
			`switch (cvt.block_alignment) {`
			`case BlockAlignment::Linear:`
			`tile_remap = linear_lut; break;`
			`case BlockAlignment::Block8x8:`
			`tile_remap = morton_lut; break;`
			`}`

			`for (unsigned int y = 0; y < cvt.input_lines; y += 8) {`
			`unsigned int row_height = std::min(cvt.input_lines - y, 8u);`

			`// Total size in pixels of incoming data required for this strip.`
			`const size_t row_data_size = row_height * cvt.input_line_width;`

			`u8* input_Y = data_buffer.get();`
			`u8* input_U = input_Y + 8 * cvt.input_line_width;`
			`u8* input_V = input_U + 8 * cvt.input_line_width / 2;`

			`switch (cvt.input_format) {`
			`case InputFormat::YUV422_Indiv8:`
			`ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);`
			`ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2);`
			`ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2);`
			`break;`
			`case InputFormat::YUV420_Indiv8:`
			`ReceiveData<1>(input_Y, cvt.src_Y, row_data_size);`
			`ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4);`
			`ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4);`
			`break;`
			`case InputFormat::YUV422_Indiv16:`
			`ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);`
			`ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2);`
			`ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2);`
			`break;`
			`case InputFormat::YUV420_Indiv16:`
			`ReceiveData<2>(input_Y, cvt.src_Y, row_data_size);`
			`ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4);`
			`ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4);`
			`break;`
			`case InputFormat::YUYV422_Interleaved:`
			`input_U = nullptr;`
			`input_V = nullptr;`
			`ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2);`
			`break;`
			`}`

			`// Note(yuriks): If additional optimization is required, input_format can be moved to a`
			`// template parameter, so that its dispatch can be moved to outside the inner loop.`
			`ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),`
			`cvt.input_line_width, row_height, cvt.coefficients);`

			`u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());`

			`for (int i = 0; i < num_tiles; ++i) {`
			`int image_strip_width, output_stride;`

			`switch (cvt.rotation) {`
			`case Rotation::None:`
			`RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);`
			`image_strip_width = cvt.input_line_width;`
			`output_stride = 8;`
			`break;`
			`case Rotation::Clockwise_90:`
			`RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);`
			`image_strip_width = 8;`
			`output_stride = 8 * row_height;`
			`break;`
			`case Rotation::Clockwise_180:`
			`// For 180 and 270 degree rotations we also invert the order of tiles in the strip,`
			`// since the rotates are done individually on each tile.`
			`RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);`
			`image_strip_width = cvt.input_line_width;`
			`output_stride = 8;`
			`break;`
			`case Rotation::Clockwise_270:`
			`RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);`
			`image_strip_width = 8;`
			`output_stride = 8 * row_height;`
			`break;`
			`}`

			`switch (cvt.block_alignment) {`
			`case BlockAlignment::Linear:`
			`WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);`
			`output_buffer += output_stride;`
			`break;`
			`case BlockAlignment::Block8x8:`
			`WriteTileToOutput(output_buffer, tmp_tile, 8, 8);`
			`output_buffer += TILE_SIZE;`
			`break;`
			`}`
			`}`

			`// Note(yuriks): If additional optimization is required, output_format can be moved to a`
			`// template parameter, so that its dispatch can be moved to outside the inner loop.`
			`SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha);`
			`}`
			`}`

			`}`
			`}`