Start of rewrite

New things:

- Multiple Python modules (I might rewrite this in C++ or something)
- This uses a tree data structure now, which is much less rickety
- Parsing is much less rickety too
This commit is contained in:
Lily Tsuru 2024-10-21 10:56:44 -04:00
parent 02c2d520c5
commit a100d6e508
4 changed files with 255 additions and 31 deletions

82
tree.py Normal file
View file

@ -0,0 +1,82 @@
# node of the tree
class TreeNode:
    """A single node of the tree.

    Attributes:
        item: Payload of the node. Can be anything (a string currently);
            None until something is assigned.
        parent: The TreeNode this node hangs off, or None. Only one parent
            can exist.
        is_sample: Flag, False by default, for callers to set on nodes that
            represent a sample themselves.
        children: Child TreeNodes, in the order they were created.
    """

    def __init__(self):
        self.item = None
        self.parent = None
        self.is_sample = False
        self.children = []

    def create_leaf(self):
        """Create an empty child node, attach it to this node and return it."""
        node = TreeNode()
        node.parent = self
        self.children.append(node)
        return node

    def is_leaf(self):
        """Return True if this node has no children to continue with."""
        return not self.children

    def create_child_leaf(self, item):
        """Create a child node holding `item` and return it."""
        node = self.create_leaf()
        node.item = item
        return node

    def walk_children(self, fn):
        """Call fn(node) on this node, then on every descendant, depth-first.

        A node is always visited before its own children.
        """
        fn(self)
        # A leaf has an empty children list, so no explicit leaf check is
        # needed before recursing.
        for node in self.children:
            node.walk_children(fn)

    def find_child(self, name):
        """Return the direct child whose item equals `name`, or None.

        Only direct children are examined. The node itself is never
        returned: a child that happens to share its parent's name must not
        be mistaken for the parent.
        """
        for node in self.children:
            if node.item == name:
                return node
        return None

    def get_parent_count(self):
        """Return how many ancestors this node has (its depth in the tree)."""
        parent_count = 0
        parent = self.parent
        while parent is not None:
            parent_count += 1
            parent = parent.parent
        return parent_count
# def walk_children(self, fn):
# fn(self)
# for node in self.children:
# fn(node)
# for other_node in node.children:
# fn(other_node)
class Tree:
    """A tree that owns a single root TreeNode.

    The root node's item is left as None.
    """

    def __init__(self):
        # create a root node
        self.root = TreeNode()

    def walk(self, fn):
        """Walk the tree from the root by delegating to root.walk_children(fn)."""
        self.root.walk_children(fn)

    def create_leaf(self, item):
        """Create a node holding `item` directly under the root and return it."""
        # TreeNode defines create_leaf() and create_child_leaf(item); there
        # is no create_child(), so calling it raised AttributeError.
        return self.root.create_child_leaf(item)

48
tree_test.py Normal file
View file

@ -0,0 +1,48 @@
import tree
# Build a small tree by hand so the walk at the bottom has something to print.
the_tree = tree.Tree()
# create two nodes through the Tree helper
virus = the_tree.create_leaf('Virus')
virus2 = the_tree.create_leaf('Worm')
# create a child node underneath 'Virus'
test = virus.create_child_leaf('Test')
# create test items inside of the 'Test' node
v1 = test.create_child_leaf('a')
v2 = test.create_child_leaf('b')
v3 = test.create_child_leaf('c')
v4 = test.create_child_leaf('884')
def walk_cb(node):
    """Print one line for `node`, indented by one tab per ancestor.

    The line shows node.item, or '[root]' when the item is None.
    """
    # Walk the parents to figure out current tree depth
    depth = 0
    parent = node.parent
    while parent is not None:
        depth += 1
        parent = parent.parent

    ident = node.item if node.item is not None else '[root]'
    tab = '\t' * depth
    print(f"{tab}{ident}")
# Dump the tree built above: walk_cb prints one line per node it is given.
the_tree.walk(walk_cb)

80
vxheaven_parse.py Normal file
View file

@ -0,0 +1,80 @@
# Basically we spit out a tree that looks like:
# | (root)
# | Virus
# | Boot
# | Marburg (samples with only one child do not create a leaf node, so they will)
# | DOS
# | Jerusalem
# | 664
# | Crypt.1808
# | Win9x
# | ...
#
# Basically, we build the tree, then walk it to execute the organize operation.
# The tree knows its parents, so to create names we can simply walk the tree backwards.
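# Ideally every sample would be indicated by a leaf node, but that only partially
# holds true: a family can be a sample itself and still have subvariants below it.
# Samples are therefore flagged out-of-band, via the `is_sample` attribute on their node.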
def _find_or_create_child(node, name):
    """Return the direct child of `node` whose item is `name`, creating it if absent."""
    for child in node.children:
        if child.item == name:
            return child
    return node.create_child_leaf(name)


def parse_into_tree(sample_tree, path):
    """Parse a text file of VXHeaven sample identifiers into `sample_tree`.

    Each non-blank line is one identifier made of dot-separated components:
    type ('Virus', 'Worm' so on), platform ('DOS', 'Win32' so on), family,
    then any number of subvariants. Every component becomes a node nested
    under the previous one, reusing a node that already exists, and the
    node for the last component is flagged with is_sample = True.

    Because the flag lives on the node, a family can be a sample on its own
    and still have subvariant samples below it. Identifiers with fewer than
    three components are accepted too; their last node is flagged the same
    way.

    Args:
        sample_tree: Tree to populate. Its `root` node is read, and nodes
            are expected to offer `children`, `item`, `is_sample` and
            `create_child_leaf(item)`.
        path: Path of the list file, one identifier per line.

    Raises:
        OSError: If the list file cannot be opened.
    """
    # The context manager closes the list file even if parsing fails.
    with open(path, 'r') as listfile:
        for line in listfile:
            line = line.strip()
            if not line:
                # A blank line would otherwise create a node with an empty name.
                continue

            # Descend one level per component, always continuing from the
            # node just found or created so existing branches are reused.
            node = sample_tree.root
            for part in line.split('.'):
                node = _find_or_create_child(node, part)

            # The last node we arrive at is the sample's node.
            node.is_sample = True

View file

@ -3,7 +3,7 @@
# simple script to organize the vxheaven collection
#
# Usage:
# Create input list with "ls > ../list" ran inside where you extracted the vxheaven archive
# Create input list
# Run this script
# ... Watch it go?
@ -13,43 +13,57 @@ import sys
from pathlib import Path, PurePath
class VxFolderInfo:
def __init__(self, vtype, platform, family, filename):
self.vtype = vtype
self.platform = platform
self.family = family
self.filename = filename
import tree
import vxheaven_parse
def MakePath(self):
return Path(os.getcwd()) / self.vtype / self.platform / self.family
# tree used to hold samples
sample_tree = tree.Tree()
def main():
folderInfo = []
listfile = open('../list', 'r')
def get_sample_name_from_tree_node(node):
sample_name = ''
if not node.is_sample:
return sample_name
for line in listfile:
line = line.strip()
split = line.split(".")
parent_list = []
parent = node.parent
while parent is not None:
# reached the root node of the tree
if parent.item is None:
break
parent_list.append(parent.item)
parent = parent.parent
try:
folderInfo.append(VxFolderInfo(split[0], split[1], split[2], line))
except:
print(f'invalid format for {split}/{line}')
for item in reversed(parent_list):
sample_name += f'{item}.'
listfile.close()
sample_name += node.item
return sample_name
for item in folderInfo:
srcPath = Path(item.filename)
dstPath = item.MakePath()
# python doesn't have true anonymous functions.
# I really regret writing this in python but i've sunk too much
# in to rewrite this in C++ or something
def walk_cb(node):
node_name = '(root)'
sample_name = get_sample_name_from_tree_node(node)
if not dstPath.is_dir():
print(f'making directory tree {str(dstPath)}')
dstPath.mkdir(parents=True)
if node.item is not None:
node_name = node.item
if srcPath.is_file():
newDst = dstPath / item.filename
print(f'moving {str(srcPath)} to {str(newDst)}')
srcPath.rename(newDst)
# Tab the list for clarity
tab = ''
for i in range(0, node.get_parent_count()):
tab += '\t'
main()
leaf = ''
if node.is_sample:
leaf = f' (sample {sample_name})'
print(f"{tab}{node_name}{leaf}")
# Parse sample tree from vxheaven list
vxheaven_parse.parse_into_tree(sample_tree, './samples.sort')
# Walk the sample tree (currently, just dumps it for debugging)
sample_tree.walk(walk_cb)