The Problem

While working on my A-level Computer Science project, I wrote thousands of lines of code, all of which needed to be included in a code appendix. Rather than manually copying and pasting each file, I decided to write a small and somewhat inefficient Python script to automate the process.

import os
# Root of the tree to collect; every file below it is appended to output.txt.
directory = '.'
with open('output.txt', 'a') as output_file:
    for root, dirs, files in os.walk(directory):
        for file in files:
            try:
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    # Read first so that a file that fails to decode
                    # contributes nothing at all to the output.
                    contents = f.read()
                    entry = f"File: {file}\nDirectory: {root}\n" + contents + '\n'
                    output_file.write(entry)
            except Exception as e:
                # Best effort: report the failure and carry on with the next file.
                print(f"Error decoding file: {os.path.join(root, file)}: {e}")

The simple script above iterates through all files in the current directory and its subdirectories, and appends the contents of each file to a single output file. This was a great start, but I wanted to improve the script to make it more efficient and user-friendly. My first goal was to implement a “blacklist” file (.apdxignore; the current version of the script, shown below, reads it from a file named .apdx), which would exclude any files matching the regex patterns in the blacklist.

Expanding the Script

Once I had developed the script to a point where I was happy with it, I uploaded it to GitHub and shared it with my classmates. I received a lot of positive feedback, which encouraged me to continue developing the script. I then added more features requested by classmates, and the script as it stands today is shown below.

import logging
import os
import argparse
import re

# Banner lines written before and after each file's contents in the output.
# Both banners are 58 characters wide.
HEADER = f"{'-' * 25}Contents{'-' * 25}"
FOOTER = f"{'-' * 24}/Contents{'-' * 25}"
# Name of the pattern file, looked up relative to the current working directory.
APDX = ".apdx"
# Size threshold in bytes (1 MiB) above which a file is copied in pieces.
CHUNK_SIZE = 1024 * 1024

def load_apdxignore(path=None):
    """Load the regex patterns listed in the pattern file.

    Args:
        path: File to read. Defaults to the module-level ``APDX`` path.

    Returns:
        A list with one pattern string per non-blank line, in file order.
        An empty list is returned when the file does not exist.
    """
    if path is None:
        path = APDX
    try:
        with open(path, 'r') as f:
            lines = f.read().splitlines()
    except FileNotFoundError:
        return []
    # A blank line would become an empty regex, and re.search('') matches
    # every path: one stray blank line would exclude (blacklist) or include
    # (whitelist) every file. Patterns themselves are kept verbatim.
    return [line for line in lines if line.strip()]

def check_file(file_path, ignore_files, blacklist):
    """Decide whether ``file_path`` should be written to the appendix.

    Args:
        file_path: Path that each pattern is searched in.
        ignore_files: Iterable of regex pattern strings.
        blacklist: When truthy, a matching pattern rejects the file;
            otherwise a matching pattern is required to accept it.

    Returns:
        True when the file should be included, False otherwise.
    """
    matched = False
    for pattern in ignore_files:
        if re.search(pattern, file_path):
            # One hit settles the question; later patterns are not tried.
            matched = True
            break
    return not matched if blacklist else matched

def log_error(file_path, e):
    """Emit an error-level log record naming the file and the exception.

    Args:
        file_path: Path of the file being processed when the error occurred.
        e: The exception (or any object) whose string form is appended.
    """
    message = f"Error decoding file: {file_path}: {e}"
    logging.error(message)

def write_to_file(output_file, file, root, file_path):
    """Append one file's entry (title, header, contents, footer) to the output.

    Nothing is written unless the whole source file can be opened and decoded
    as UTF-8, so a binary or unreadable file never leaves a dangling header
    without contents and footer in the appendix.

    Args:
        output_file: Writable text stream receiving the entry.
        file: File name shown on the ``File:`` line.
        root: Directory shown on the ``Directory:`` line.
        file_path: Path of the file whose contents are copied.

    Raises:
        OSError: If the source file cannot be opened or read.
        UnicodeDecodeError: If the source file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # First pass: decode the whole file in bounded pieces and discard the
        # text. Any decoding error surfaces here, before the output is touched.
        for _ in iter(lambda: f.read(CHUNK_SIZE), ''):
            pass
        f.seek(0)
        output_file.write(f"File: {file}\nDirectory: {root}\n{HEADER}\n")
        # Second pass: copy in CHUNK_SIZE pieces so memory use stays bounded
        # regardless of the file's size.
        for chunk in iter(lambda: f.read(CHUNK_SIZE), ''):
            output_file.write(chunk)
    output_file.write(f"\n{FOOTER}\n\n")

def process_files(directory, ignore_files, output_file, blacklist):
    """Walk ``directory`` and append every selected file to ``output_file``.

    Each file found by ``os.walk`` is passed to ``check_file``; accepted files
    are written with ``write_to_file``. Read, decode and encode failures are
    logged and the walk continues with the next file.

    Args:
        directory: Root of the tree to walk.
        ignore_files: Regex pattern strings forwarded to ``check_file``.
        output_file: Open text stream the entries are appended to.
        blacklist: Forwarded to ``check_file`` to select blacklist or
            whitelist behaviour.
    """
    # The output file usually lives inside the tree being walked. Reading it
    # while it is being written would copy the appendix into itself, so its
    # resolved path is remembered and skipped. Streams without a string
    # ``name`` (e.g. in-memory buffers) have nothing to skip.
    output_name = getattr(output_file, 'name', None)
    output_path = None
    if isinstance(output_name, str):
        output_path = os.path.normcase(os.path.realpath(output_name))

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Built outside the try so the handlers can always refer to it.
            file_path = os.path.join(root, file)
            if output_path is not None:
                if os.path.normcase(os.path.realpath(file_path)) == output_path:
                    continue
            try:
                if check_file(file_path, ignore_files, blacklist):
                    write_to_file(output_file, file, root, file_path)
            except (IOError, PermissionError) as e:
                log_error(file_path, e)
            except UnicodeDecodeError as e:
                logging.error(f"Error decoding file: {file_path}: {e}")
            except UnicodeEncodeError as e:
                logging.error(f"Error encoding file: {file_path}: {e}")
def main(blacklist=True, output_file_path="output.txt", directory=".", overwrite=True):
    """Generate the appendix file for ``directory``.

    Args:
        blacklist: True to treat the pattern file as a blacklist, False to
            treat it as a whitelist.
        output_file_path: Path of the appendix file to write.
        directory: Root of the tree whose files are collected.
        overwrite: True to replace any existing output file, False to append
            to it.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info(f"Parameters: apdx_operating_mode={'Blacklist' if blacklist else 'Whitelist'}, output_file_path={output_file_path}, directory={directory}, overwrite={overwrite}")
    # load_apdxignore already returns an empty list when the file is missing.
    ignore_files = load_apdxignore()
    # 'w' truncates an existing file, which replaces the separate
    # exists-then-remove step and its race window.
    mode = 'w' if overwrite else 'a'
    # Source files are decoded as UTF-8, so the output must be UTF-8 too;
    # with the platform default encoding, characters outside the locale's
    # code page could not be written and aborted the entry half-way.
    with open(output_file_path, mode, encoding='utf-8') as output_file:
        process_files(directory, ignore_files, output_file, blacklist)

if __name__ == "__main__":
    # Command-line entry point: translate the flags into arguments for main().
    cli = argparse.ArgumentParser(description='Generate an appendix.')
    cli.add_argument('--blacklist', dest='apdx_blacklist', action='store_true',
                     help='Use .apdx as a blacklist')
    cli.add_argument('--whitelist', dest='apdx_whitelist', action='store_true',
                     help='Use .apdx as a whitelist')
    cli.add_argument('--output', dest='output_file_path', default='output.txt',
                     help='Output file path')
    cli.add_argument('--dir', dest='directory', default='.',
                     help='Directory to process')
    cli.add_argument('--no-overwrite', dest='overwrite', action='store_false',
                     help='Do not overwrite existing output file')
    options = cli.parse_args()
    # Blacklist mode is the default; whitelist mode applies only when
    # --whitelist is given without --blacklist.
    use_blacklist = options.apdx_blacklist or not options.apdx_whitelist
    main(use_blacklist, options.output_file_path, options.directory, options.overwrite)

All of the code above is available on GitHub, and I encourage you to check it out and contribute if you have any ideas for improvements. I hope this script can be useful to others who are writing academic papers and need to include a code appendix.