Compare commits

..

10 commits

Author SHA1 Message Date
ac32c07d7f add license and README 2024-10-22 03:00:07 -04:00
59ac84466f use arguments to configure instead of hardcoded paths 2024-10-22 02:59:58 -04:00
9af77c6159 use threadpool for stuff 2024-10-22 00:49:49 -04:00
a997a356d2 it kind of works now 2024-10-21 21:21:37 -04:00
0eaca7d0c9 fix subvariant parsing/insertion into tree 2024-10-21 20:50:52 -04:00
6f41bc9180 clean up tree root detection 2024-10-21 19:56:43 -04:00
9bdff0be14 fix parse bug I accidentally introduced 2024-10-21 12:38:39 -04:00
f8285f9e4c cleanup 2024-10-21 12:27:40 -04:00
af49206a16 Rewrite it in C++
rust people are going to be very mad at me now /j
2024-10-21 12:22:35 -04:00
20d72e0cd2 reimplement the tree in c++
(mostly for testing)
2024-10-21 11:16:15 -04:00
19 changed files with 708 additions and 0 deletions

44
.clang-format Executable file
View file

@ -0,0 +1,44 @@
BasedOnStyle: Google
# force T* or T&
DerivePointerAlignment: false
PointerAlignment: Left
TabWidth: 4
IndentWidth: 4
UseTab: Always
IndentPPDirectives: BeforeHash
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortFunctionsOnASingleLine: InlineOnly
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: true
BinPackArguments: true
BinPackParameters: true
BreakConstructorInitializers: BeforeColon
BreakStringLiterals: false
ColumnLimit: 150
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ContinuationIndentWidth: 0
# turning this on causes major issues with initializer lists
Cpp11BracedListStyle: false
SpaceBeforeCpp11BracedList: true
FixNamespaceComments: true
NamespaceIndentation: All
ReflowComments: true
SortIncludes: CaseInsensitive
SortUsingDeclarations: true
SpacesInSquareBrackets: false
SpaceBeforeParens: Never
SpacesBeforeTrailingComments: 1

7
.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
/__pycache__
/.cache
*.o
/vxorg
/tree_test
/testdata
/compile_commands.json

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "indicators"]
path = indicators
url = https://github.com/p-ranav/indicators

7
LICENSE Normal file
View file

@ -0,0 +1,7 @@
Copyright 2023-2024 Lily Tsuru
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

22
Makefile Normal file
View file

@ -0,0 +1,22 @@
# Compiler driver plus the flags every translation unit needs.
CXX = g++ -std=c++23 -O3 -I indicators/include

# These targets do not name files; always run them when asked.
.PHONY: all clean

all: vxorg tree_test

# -f so that cleaning an already-clean tree is not an error.
clean:
	rm -f vxorg tree_test *.o

vxorg: vxorg.o vxheaven_parse.o threadpool.o
	$(CXX) $^ -o $@

tree_test: tree_test.o
	$(CXX) $^ -o $@

%.o: %.cpp
	$(CXX) -c $< -o $@

# dep rules
# I feel like it's 1970 again
tree_test.o: tree.hpp
vxorg.o: tree.hpp vxheaven_parse.hpp threadpool.hpp
vxheaven_parse.o: tree.hpp vxheaven_parse.hpp
threadpool.o: threadpool.hpp

24
README.md Normal file
View file

@ -0,0 +1,24 @@
# vxorg
vxheaven organizer (converts it from a flat hierarchy of ~270k+ files to a neat tree). Originally written in python, I rewrote it in C++ for performance reasons.
# History
- 2018: I wrote a really shoddy attempt at doing organization in Bash. It sucked because I wasn't taking care of many idiosyncrasies about sample naming.
- It also was very primitive and slow, since it would continually spawn `mv` processes just to move files. (same for `mkdir` too, but that is less of a concern since it's done less)
- 2023: I wrote a new script in Python. It was "better" but still didn't work
- I actually made the same mistake and tried to write in Bash again, but even Python was worlds faster, so I rewrote it in python
- October 21, 2024: I decided to start rewriting the Python script I wrote to parse into an N-ary tree for memory savings while still allowing memoization. (and be modular instead of one blob)
- Later in the day, as an experiment, I rewrote the parsing algorithm (fixing a bug in the process) in C++. It was 100x faster, so I committed to a rewrite in C++
# Building
`make`
# Usage
- Generate a list of samples.
- `tar tf xxx/viruses-2010-05-18.tar.bz2 | sed 's/\.\///g' | awk NF | sort > list` is one option. Not the best but it's (basically) what I did
- Run with `./vxorg list src/ dest/`
- `dest/` will be created if it does not exist.
- It will show a progress bar as it completes.

1
indicators Submodule

@ -0,0 +1 @@
Subproject commit 9c855c95e7782541a419597242535562fa9e41d7

3
python_refonly/README.md Normal file
View file

@ -0,0 +1,3 @@
# python
This was the original vxorg refactor/rewrite. It was abandoned because the tree algorithms in python were so unbearably slow that even an iffy c++ reimplementation of the same tree is 100x faster

70
threadpool.cpp Normal file
View file

@ -0,0 +1,70 @@
#include "threadpool.hpp"
/// Worker thread body. Each worker services exactly one queue, `pPool->taskQueues[myIndex]`.
///
/// The worker sleeps until its queue has work or the pool is shutting down, then
/// runs queued tasks one at a time, oldest first. It only exits once shutdown has
/// been requested AND its queue is drained, so tasks queued before shutdown still run.
///
/// @param pPool    The owning pool; must outlive this thread.
/// @param myIndex  Index of this worker's queue inside the pool.
void ThreadPool::ThreadEntry(ThreadPool* pPool, std::size_t myIndex) {
	auto& pool = *pPool;
	auto& myQueue = pool.taskQueues[myIndex];

	// set a cutesy name
#ifdef __linux__
	pthread_setname_np(pthread_self(), "PoolWorker");
#endif

	// The thread loop
	while(true) {
		std::function<void()> task;

		{
			// wait for at least a single task, or shutdown notification (one of the two)
			std::unique_lock lk(myQueue.lock);
			pool.queueCv.wait(lk, [&]() { return pool.threadsShouldShutdown || !myQueue.queue.empty(); });

			// The wait only returns with an empty queue when shutdown was requested,
			// which is exactly the "shutdown and nothing left to do" exit condition.
			if(myQueue.queue.empty())
				break;

			// Tasks are pushed at the front, so the back is the oldest one.
			// Move it out so the lock can be dropped before the task runs.
			task = std::move(myQueue.queue.back());
			myQueue.queue.pop_back();
		}

		// Run without holding the queue lock: producers can inspect and refill the
		// queue while this (possibly slow) task executes.
		// TODO: Work-steal from other threads.
		task();
	}
}
void ThreadPool::launch(std::size_t nrThreads) {
threadsShouldShutdown = false;
this->nrThreads = nrThreads;
threads.resize(this->nrThreads);
taskQueues = new TaskQueue[this->nrThreads];
for(std::size_t i = 0; i < this->nrThreads; ++i)
threads.emplace_back(std::thread(&ThreadEntry, this, i));
}
// Shutdown the thread pool
void ThreadPool::shutdown() {
if(!threadsShouldShutdown)
threadsShouldShutdown = true;
queueCv.notify_all();
// join all the threads (if possible) to make sure they all exit
for(auto& thread : threads)
if(thread.joinable())
thread.join();
nrThreads = 0;
delete[] taskQueues;
taskQueues = nullptr;
}

92
threadpool.hpp Normal file
View file

@ -0,0 +1,92 @@
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdlib>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>
/// A simple thread pool executor.
/// Not at all optimized, and probably terrible for latency.
///
/// Every worker thread owns one task queue; add_task() picks a queue at random.
/// The pool is neither copyable nor movable, and its destructor shuts it down.
struct ThreadPool {
	ThreadPool() = default;

	// shorthand to call launch(nrThreads) automatically
	inline explicit ThreadPool(std::size_t nrThreads) { launch(nrThreads); }

	// move could be allowed, I guess
	ThreadPool(const ThreadPool&) = delete;
	ThreadPool(ThreadPool&&) = delete;

	inline ~ThreadPool() { shutdown(); }

	/// Queues `cb` for execution on one of the worker threads.
	///
	/// Takes anything that is callable with a void() signature.
	/// This includes capturing lambdas, so be careful or make sure you're locking state!
	///
	/// If the chosen worker already has 4 or more tasks queued, this call blocks
	/// (polling every 100 ms) until that worker's queue is empty.
	/// If the pool has no workers (never launched, or already shut down) the task
	/// is run immediately on the calling thread instead.
	template <class Callable>
	void add_task(Callable&& cb) {
		// No workers means no queues: picking one would divide by zero.
		if(nrThreads == 0) {
			std::forward<Callable>(cb)();
			return;
		}

		const auto worker = PickWorker();

		// Crude backpressure so the producer cannot run arbitrarily far ahead.
		if(QueueLength(worker) >= 4) {
			while(!QueueEmpty(worker)) {
				std::this_thread::sleep_for(std::chrono::milliseconds(100));
			}
		}

		// add it to the task queue for that thread
		{
			std::unique_lock lk(this->taskQueues[worker].lock);
			// Forward so rvalue callables are moved into the queue rather than copied.
			taskQueues[worker].queue.emplace_front(std::forward<Callable>(cb));
		}

		// Wake threads up if they are waiting for work
		queueCv.notify_all();
	}

	/// Starts `nrThreads` worker threads.
	void launch(std::size_t nrThreads);

	/// Shutdown the thread pool
	void shutdown();

   private:
	// could just use unique_ptr<T[]> for both of these,
	// or an analogue, since they will only increase or decrease in size on a call to launch()
	std::vector<std::thread> threads {}; // or analogue
	std::size_t nrThreads = 0;

	/// One worker's pending tasks plus the mutex guarding them.
	struct TaskQueue {
		std::mutex lock {};
		std::deque<std::function<void()>> queue {};
	};

	TaskQueue* taskQueues {};

	// Used to notify threads when work is available or to shutdown.
	// condition_variable_any, because every worker waits on this one variable
	// with its OWN queue mutex; plain std::condition_variable requires all
	// concurrent waiters to use the same mutex.
	std::condition_variable_any queueCv {};

	/// Used to notify on shutdown
	std::atomic_bool threadsShouldShutdown { false };

	/// Number of tasks currently queued for `worker` (takes that queue's lock).
	std::size_t QueueLength(std::size_t worker) const {
		std::unique_lock lk(this->taskQueues[worker].lock);
		return this->taskQueues[worker].queue.size();
	}

	/// True if `worker` has no queued tasks (takes that queue's lock).
	bool QueueEmpty(std::size_t worker) const {
		std::unique_lock lk(this->taskQueues[worker].lock);
		return this->taskQueues[worker].queue.empty();
	}

	/// Picks a random worker index. Requires nrThreads > 0.
	std::size_t PickWorker() const {
		return std::rand() % nrThreads;
	}

	static void ThreadEntry(ThreadPool* pPool, std::size_t myIndex);
};

93
tree.hpp Normal file
View file

@ -0,0 +1,93 @@
#pragma once
#include <vector>
/// A simplistic N-ary/generic tree. Probably not very good for data locality.
///
/// The tree owns a root node; every node owns its children and deletes them in
/// its destructor. Node pointers handed out stay valid until the tree is destroyed.
template <class T>
struct Tree {
	// FIXME:
	// - make T not require default constructability
	// - move instead of copy into leaf
	// - use "btree-like" representation of N-ary nodes to save memory

	struct Node {
	   protected:
		friend Tree;
		Node* parent = nullptr;
		std::vector<Node*> children {};
		T item {};

	   public:
		Node() = default;

		// A node owns its children through raw pointers, so a memberwise copy
		// would end in a double delete. Nodes are created in place only.
		Node(const Node&) = delete;
		Node& operator=(const Node&) = delete;

		~Node() {
			for(auto* child : children)
				delete child;
		}

		T& data() { return item; }
		const T& data() const { return item; }

		bool is_leaf() const { return children.empty(); }
		bool is_root() const { return parent == nullptr; }

		Node* parent_node() { return parent; }

		/// Appends a new child holding a copy of `item` and returns it.
		/// The returned node is owned by this node.
		Node* create_leaf(const T& item) {
			auto* node = new Node;
			node->parent = this;
			node->item = item;
			children.push_back(node);
			return node;
		}

		/// Calls `fn(node)` for this node and then, depth-first in insertion
		/// order, for every descendant.
		template <class Fn>
		void walk(Fn&& fn) {
			fn(this);
			for(auto* child : children)
				child->walk(fn);
		}

		/// Returns the first direct child for which `predicate(child)` is true,
		/// or nullptr if there is none. Only direct children are considered:
		/// neither this node itself nor deeper descendants can be returned.
		template <class Pred>
		Node* find_child(Pred&& predicate) {
			for(auto* child : children)
				if(predicate(child))
					return child;
			return nullptr;
		}

		/// Number of ancestors between this node and the root, inclusive of the
		/// root (so the root itself reports 0).
		std::size_t parent_count() const {
			std::size_t count = 0;
			for(auto* ancestor = parent; ancestor != nullptr; ancestor = ancestor->parent)
				++count;
			return count;
		}
	};

	Tree() : root(new Node) {}
	~Tree() { delete root; }

	// Trees are not copyable but they can move
	Tree(const Tree&) = delete;
	Tree& operator=(const Tree&) = delete;

	// Moving steals the root. The moved-from tree is left without a root
	// (root_node() returns nullptr) and may only be destroyed or assigned to.
	Tree(Tree&& other) noexcept : root(other.root) { other.root = nullptr; }
	Tree& operator=(Tree&& other) noexcept {
		if(this != &other) {
			delete root;
			root = other.root;
			other.root = nullptr;
		}
		return *this;
	}

	/// Walks the whole tree starting at the root; see Node::walk.
	template <class Fn>
	void walk(Fn&& fn) {
		root->walk(fn);
	}

	/// Creates a new node directly below the root.
	Node* create_leaf(const T& item) { return root->create_leaf(item); }
	Node* root_node() { return root; }

   private:
	Node* root;
};

35
tree_test.cpp Normal file
View file

@ -0,0 +1,35 @@
#include "tree.hpp"
#include <string>
/// Builds a small sample tree and prints it to stdout, one node per line,
/// indented with one tab per ancestor. The root is printed as "(root)".
void test_tree() {
	Tree<std::string> tree;

	auto* virus = tree.create_leaf("Virus");
	tree.create_leaf("Worm");

	auto* test = virus->create_leaf("test");
	test->create_leaf("a");
	test->create_leaf("b");
	test->create_leaf("c");
	test->create_leaf("884");

	tree.walk([](auto* node) {
		const std::size_t tab_count = node->parent_count();

		// Same unsigned type as the depth, so the comparison is not mixed-sign.
		for(std::size_t i = 0; i < tab_count; ++i)
			std::printf("\t");

		if(node->is_root()) {
			std::printf("(root)\n");
		} else {
			std::printf("%s\n", node->data().c_str());
		}
	});
}
/// Entry point of the tree smoke test: dumps the sample tree and exits successfully.
int main() {
	test_tree();

	// Nothing here can fail; always report success.
	return 0;
}

150
vxheaven_parse.cpp Normal file
View file

@ -0,0 +1,150 @@
#include "vxheaven_parse.hpp"
#include <format>
#include <fstream>
#include <optional>
#include <ranges>
#include <span>
#include <string>
#include <string_view>
#include <vector>
namespace vxorg {
/// Splits `string` at every `delim` and returns the pieces in order.
/// Adjacent, leading or trailing delimiters yield empty pieces; an empty input
/// yields no pieces at all.
/// The returned views point into `string`, so make sure it outlives the vector.
std::vector<std::string_view> split_by(const std::string& string, char delim) {
	std::vector<std::string_view> pieces {};

	// Nothing to split: report zero pieces rather than one empty piece.
	if(string.empty())
		return pieces;

	const std::string_view whole { string };
	std::size_t start = 0;

	while(true) {
		const auto stop = whole.find(delim, start);
		if(stop == std::string_view::npos) {
			// Tail after the last delimiter (possibly empty).
			pieces.push_back(whole.substr(start));
			break;
		}

		pieces.push_back(whole.substr(start, stop - start));
		start = stop + 1;
	}

	return pieces;
}
/// Calls `fn(ancestor)` for every ancestor of `node`, outermost first.
/// The tree root and `node` itself are not visited.
///
/// @param node  Node whose ancestors are walked; must not be null.
/// @param fn    Callable taking a `VxHeavenTree::Node*`.
template<class Fn>
void walk_parents_in_tree_order(VxHeavenTree::Node* node, Fn&& fn) {
	// Collect bottom-up (nearest parent first), stopping before the root...
	std::vector<VxHeavenTree::Node*> ancestors {};
	for(auto* parent = node->parent_node(); parent != nullptr && !parent->is_root(); parent = parent->parent_node())
		ancestors.push_back(parent);

	// ...then replay top-down so callers see the path in tree order.
	for(auto it = ancestors.rbegin(); it != ancestors.rend(); ++it)
		fn(*it);
}
/// Returns the name of `node`.
/// For a sample node this is the full dotted name: every non-root ancestor's
/// name, outermost first, each followed by '.', then the node's own name.
/// For a non-sample node it is just the node's own name; for nullptr, "".
std::string get_sample_name(VxHeavenTree::Node* node) {
	if(node == nullptr)
		return "";

	const auto& own_name = node->data().name;
	if(!node->data().is_sample)
		return own_name;

	std::string full_name {};
	walk_parents_in_tree_order(node, [&full_name](auto* ancestor) {
		full_name += ancestor->data().name;
		full_name += '.';
	});
	full_name += own_name;
	return full_name;
}
/// Returns the relative directory for `node`: the names of its non-root
/// ancestors, outermost first, joined as path components. The node's own name
/// is not part of the result. Returns an empty path for nullptr.
std::filesystem::path get_sample_path(VxHeavenTree::Node* node) {
	std::filesystem::path directory {};

	if(node != nullptr) {
		walk_parents_in_tree_order(node, [&directory](auto* ancestor) { directory /= ancestor->data().name; });
	}

	return directory;
}
void parse_into_tree(VxHeavenTree& tree, std::istream& is) {
std::string line {};
while(std::getline(is, line)) {
auto split = split_by(line, '.');
VxHeavenTree::Node* type_leaf { nullptr };
VxHeavenTree::Node* platform_leaf { nullptr };
VxHeavenTree::Node* family_leaf { nullptr };
VxHeavenTree::Node* sample_leaf { nullptr };
if(auto* node = tree.root_node()->find_child([&](auto* node) { return node->data().name == split[0]; }); node == nullptr) {
// std::printf("making leaf for type %.*s\n", split[0].length(), split[0].data());
type_leaf = tree.create_leaf({ .name = std::string(split[0].data(), split[0].length()), .is_sample = false });
} else {
// std::printf("using existing leaf for type %.*s\n", split[0].length(), split[0].data());
type_leaf = node;
}
if(split.size() == 1) {
type_leaf->data().is_sample = true;
continue;
}
if(auto* n = type_leaf->find_child([&](auto* node) {
// auto matches = node->data().name == split[1];
// std::printf("trying to find %.*s in node %s's child %s: %s\n", split[1].length(), split[1].data(),
// type_leaf->data().name.c_str(), node->data().name.c_str(), matches ? "matches": "doesnt fucking match god damn it");
return node->data().name == split[1];
});
n == nullptr) {
// std::printf("making leaf for platform %s %.*s\n", type_leaf->data().name.c_str(), split[1].length(), split[1].data());
platform_leaf = type_leaf->create_leaf({ .name = std::string(split[1].data(), split[1].length()), .is_sample = false });
} else {
// std::printf("using existing leaf for platform %.*s\n", split[1].length(), split[1].data());
platform_leaf = n;
}
if(auto* n = platform_leaf->find_child([&](auto* node) { return node->data().name == split[2]; }); n == nullptr) {
// std::printf("making leaf for platform %s %.*s\n", type_leaf->data().name.c_str(), split[1].length(), split[1].data());
family_leaf = platform_leaf->create_leaf({ .name = std::string(split[2].data(), split[2].length()), .is_sample = false });
} else {
// std::printf("using existing leaf for platform %.*s\n", split[1].length(), split[1].data());
family_leaf = n;
}
// Handle famlies with a variantless sample inside of them
if(split.size() == 3) {
family_leaf->data().is_sample = true;
continue;
}
if(split.size() > 4) {
auto subvariants = std::span(split.data() + 3, split.size() - 3);
auto leaf = family_leaf;
for(auto& subvariant : subvariants) {
if(auto* node = leaf->find_child([&](auto* node) { return node->data().name == subvariant; }); node == nullptr) {
//printf("creating variant %.*s %.*s\n", split[2].length(), split[2].data(), subvariant.length(), subvariant.data());
leaf = leaf->create_leaf({ .name = std::string(subvariant.data(), subvariant.length()), .is_sample = false });
} else {
// existing node for a subvariant
//printf("existing variant %.*s %.*s\n", split[2].length(), split[2].data(), subvariant.length(), subvariant.data());
leaf = node;
continue;
}
}
// The last node we visit is the sample
leaf->data().is_sample = true;
} else {
auto subvariant = split[3];
if(auto* node = family_leaf->find_child([&](auto* node) { return node->data().name == subvariant; }); node == nullptr) {
family_leaf->create_leaf({ .name = std::string(subvariant.data(), subvariant.length()), .is_sample = true });
}
}
}
}
} // namespace vxorg

23
vxheaven_parse.hpp Normal file
View file

@ -0,0 +1,23 @@
#pragma once
#include <string>
#include "tree.hpp"
#include <filesystem>
namespace vxorg {

	/// Payload of one node in the sample tree: a single name component.
	struct VxHeavenItem {
		/// This node's own name component (no dots).
		std::string name {};

		/// True if this item is also a sample.
		/// Defaults to false so a partially initialized item is never read uninitialized.
		bool is_sample = false;
	};

	using VxHeavenTree = Tree<VxHeavenItem>;

	/// Returns the full dotted name of a sample node (ancestor names joined with
	/// '.'), just the node's own name for a non-sample node, or "" for nullptr.
	std::string get_sample_name(VxHeavenTree::Node* node);

	/// Returns the relative directory made of the node's non-root ancestor
	/// names, or an empty path for nullptr.
	std::filesystem::path get_sample_path(VxHeavenTree::Node* node);

	/// Parses one dotted sample name per line from `is` into `tree`.
	void parse_into_tree(VxHeavenTree& tree, std::istream& is);

} // namespace vxorg

134
vxorg.cpp Normal file
View file

@ -0,0 +1,134 @@
#include <cstring>
#include <filesystem>
#include <fstream>
#include <indicators/progress_bar.hpp>
#include <indicators/terminal_size.hpp>
#include <string>
#include "indicators/terminal_size.hpp"
#include "threadpool.hpp"
#include "tree.hpp"
#include "vxheaven_parse.hpp"
namespace ind = indicators;
/// vxorg entry point.
///
/// Usage: vxorg [path to list] [source path] [destination path]
///
/// Parses the sample list into a tree, then moves every listed sample that
/// exists in the source directory into a directory below the destination that
/// mirrors the sample's position in the tree. Moves run on a small thread pool
/// while a progress bar is displayed.
///
/// @return 0 on success, 1 on bad usage, an unreadable list or a missing source directory.
int main(int argc, char** argv) {
	if(argc != 4) {
		std::fprintf(stderr, "usage: %s [path to list] [source path] [destination path]\n", argv[0]);
		return 1;
	}

	vxorg::VxHeavenTree sample_tree;

	// Parse into the sample tree
	std::ifstream ifs(argv[1]);
	if(!ifs) {
		// std::strerror rather than strerror_r: the GNU strerror_r variant may
		// return a pointer to a static message and leave the caller's buffer
		// untouched, which printed an empty reason.
		std::fprintf(stderr, "Could not open sample list \"%s\": %s\n", argv[1], std::strerror(errno));
		return 1;
	}

	vxorg::parse_into_tree(sample_tree, ifs);

	std::filesystem::path unorganized_source_path = argv[2];
	std::filesystem::path organized_destination_path = argv[3];

	if(!std::filesystem::exists(unorganized_source_path)) {
		std::fprintf(stderr, "Source path \"%s\" does not exist\n", argv[2]);
		return 1;
	}

	// create_directories is a no-op when the directory already exists
	std::filesystem::create_directories(organized_destination_path);

	std::size_t sampleCount = 0;

	// Walk the tree to get the amount of sample nodes
	sample_tree.walk([&](auto* node) {
		if(node->data().is_sample)
			sampleCount++;
	});

	// Leave 64 columns for the postfix text, but never let the unsigned
	// subtraction wrap around on a narrow (or unknown-size) terminal.
	const std::size_t terminal_columns = ind::terminal_width();
	const std::size_t bar_width = terminal_columns > 64 ? terminal_columns - 64 : 16;

	ind::ProgressBar bar { ind::option::BarWidth { bar_width },
						   ind::option::Start { "[" },
						   ind::option::Fill { "" },
						   ind::option::Lead { "" },
						   ind::option::Remainder { "-" },
						   ind::option::End { " ]" },
						   ind::option::ForegroundColor { ind::Color::red },
						   ind::option::FontStyles { std::vector<ind::FontStyle> { ind::FontStyle::bold } },
						   ind::option::MaxProgress { sampleCount } };

	// used for os filesystem ops
	// Declared after `bar` so it is destroyed (and its workers joined) before
	// the bar that the queued tasks reference.
	ThreadPool filesystem_threadpool(4);

	// Walk the tree to perform the operation
	sample_tree.walk([&](auto* node) {
		auto& data = node->data();
		if(node->is_root() || !data.is_sample)
			return;

		const std::string sample_name = vxorg::get_sample_name(node);

		// paths
		auto path = organized_destination_path / vxorg::get_sample_path(node);
		auto source_path = unorganized_source_path / sample_name;

		if(!std::filesystem::exists(source_path)) {
			std::printf("WARNING: sample %s in tree (source disk file %s) does not exist\n", sample_name.c_str(),
						source_path.string().c_str());
			return;
		}

		filesystem_threadpool.add_task([path, source_path, sample_name, &bar]() {
			bar.set_option(ind::option::PostfixText { "Moving " + sample_name });

			const auto dest_path = path / sample_name;

			// An exception escaping a worker thread would terminate the whole
			// program, so report the failed sample and carry on with the rest.
			try {
				// Both calls are no-ops when there is nothing to do, so no
				// separate (racy) exists() checks are needed.
				std::filesystem::create_directories(path);
				std::filesystem::remove(dest_path);
				std::filesystem::rename(source_path, dest_path);
			} catch(const std::filesystem::filesystem_error& error) {
				std::fprintf(stderr, "\nWARNING: could not move sample %s: %s\n", sample_name.c_str(), error.what());
			}

			bar.tick();
		});
	});

	// Drains the queues and joins the workers before the bar is finalized.
	filesystem_threadpool.shutdown();
	bar.mark_as_completed();
	std::printf("Done.\n");
	return 0;
}