mirror of
https://github.com/Open-Cascade-SAS/OCCT.git
synced 2026-05-10 17:40:24 +08:00
- Added automated migration scripts for handle syntax, standard types, and macros - Deprecated legacy `Standard_*` types and macros in favor of native C++ equivalents - Introduced modern `occ` namespace with template-based type checking helpers - Enhanced NCollection macros to support variadic arguments for complex template types
265 lines
9.1 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2025 OPEN CASCADE SAS
|
|
#
|
|
# This file is part of Open CASCADE Technology software library.
|
|
#
|
|
# This library is free software; you can redistribute it and/or modify it under
|
|
# the terms of the GNU Lesser General Public License version 2.1 as published
|
|
# by the Free Software Foundation, with special exception defined in the file
|
|
# OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
|
|
# distribution for complete text of the license and disclaimer of any warranty.
|
|
#
|
|
# Alternatively, this file may be used under the terms of Open CASCADE
|
|
# commercial license or contractual agreement.
|
|
|
|
"""
|
|
Script to clean up duplicate #include directives and self-includes in C++ source files.
|
|
Removes:
|
|
1. Duplicate #include statements (keeps only the first occurrence)
|
|
2. Self-includes (e.g., Foo.hxx including "Foo.hxx")
|
|
|
|
Processes: .cxx, .hxx, .pxx, .lxx, .gxx files
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
|
|
|
|
def get_filename_without_extension(filepath: str) -> str:
    """Return the file name portion of *filepath* minus its last extension.

    E.g. 'src/Foo.hxx' -> 'Foo', 'a/b/c.tar.gz' -> 'c.tar'.
    """
    path_obj = Path(filepath)
    return path_obj.stem
|
|
|
|
|
|
def process_file(filepath: str, dry_run: bool = False) -> Tuple[bool, int, bool]:
    """
    Process a single file to remove duplicate includes and self-includes.

    Duplicates are tracked per preprocessor scope (#if/#ifdef/#ifndef ...
    #endif), so an include repeated in mutually exclusive branches is kept.
    Includes inside block comments (/* ... */) or line comments (//) are
    left untouched.

    Args:
        filepath: Path of the C/C++ source or header file to clean.
        dry_run: When True, report what would change without rewriting the file.

    Returns:
        Tuple of (modified, num_duplicates_removed, had_self_include)
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return False, 0, False

    # Pattern to match #include directives and preprocessor directives
    include_pattern = re.compile(r'^\s*#\s*include\s+[<"]([^>"]+)[>"]')
    preprocessor_pattern = re.compile(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)')

    # Stack of sets; each entry holds the includes already seen in the
    # current preprocessor scope (seeded from the enclosing scope).
    ifdef_stack = [set()]
    new_lines = []
    duplicates_removed = 0
    had_self_include = False
    in_block_comment = False

    # Path.stem / Path.suffix give the base name and extension of this file,
    # used to detect self-includes below.
    base_filename = Path(filepath).stem
    file_extension = Path(filepath).suffix

    for line in lines:
        # Track multi-line /* ... */ comments.
        if in_block_comment:
            # BUGFIX: the closing */ must be detected while already inside a
            # block comment; previously it was only checked on lines that also
            # contained '/*', so the comment never ended and the rest of the
            # file was skipped.
            if '*/' in line:
                in_block_comment = False
            new_lines.append(line)
            continue
        if '/*' in line:
            # Comment opens here; it stays open unless it also closes on the
            # same line after the opener.
            if '*/' not in line.split('/*', 1)[1]:
                in_block_comment = True
            new_lines.append(line)
            continue

        # Leave single-line comments untouched.
        stripped = line.lstrip()
        if stripped.startswith('//'):
            new_lines.append(line)
            continue

        # Track preprocessor scope changes
        prep_match = preprocessor_pattern.match(line)
        if prep_match:
            directive = prep_match.group(1)
            if directive in ('if', 'ifdef', 'ifndef'):
                # Start new scope, inheriting parent scope's includes
                parent_includes = ifdef_stack[-1].copy() if ifdef_stack else set()
                ifdef_stack.append(parent_includes)
            elif directive == 'endif':
                # End scope (guard against unbalanced #endif)
                if len(ifdef_stack) > 1:
                    ifdef_stack.pop()
            elif directive in ('elif', 'else'):
                # Alternative branch: reset to the scope from before the whole
                # #if block so the sibling branch's includes are not treated
                # as duplicates (the branches are mutually exclusive).
                if len(ifdef_stack) > 1:
                    ifdef_stack[-1] = ifdef_stack[-2].copy()
                else:
                    ifdef_stack[-1] = set()
            new_lines.append(line)
            continue

        match = include_pattern.match(line)

        if match:
            included_file = match.group(1)

            # Skip malformed includes (e.g., empty or just extension)
            if not included_file or included_file.startswith('.') or len(included_file) <= 4:
                new_lines.append(line)
                continue

            included_basename = Path(included_file).stem
            included_extension = Path(included_file).suffix

            # Skip .gxx and .pxx includes from duplicate checking:
            # these are template implementation files that may be
            # intentionally included multiple times.
            if included_extension in ('.gxx', '.pxx'):
                new_lines.append(line)
                continue

            # Check for self-include (same name AND same extension)
            if included_basename == base_filename and included_extension == file_extension:
                had_self_include = True
                print(f" Removing self-include: {line.strip()}")
                continue

            # Check for duplicate only in current scope
            current_scope = ifdef_stack[-1]
            if included_file in current_scope:
                duplicates_removed += 1
                print(f" Removing duplicate: {line.strip()}")
                continue

            current_scope.add(included_file)

        new_lines.append(line)

    # Only removals happen above, so a length change means the file changed.
    modified = len(new_lines) != len(lines)

    if modified and not dry_run:
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.writelines(new_lines)
        except Exception as e:
            print(f"Error writing {filepath}: {e}")
            return False, 0, False

    return modified, duplicates_removed, had_self_include
|
|
|
|
|
|
def find_files(root_dir: str, extensions: List[str]) -> List[str]:
    """Recursively collect paths of files under *root_dir* matching any extension.

    Results are grouped by extension, in the order the extensions are given.
    """
    root = Path(root_dir)
    matches: List[str] = []
    for suffix in extensions:
        matches.extend(str(hit) for hit in root.rglob(f"*{suffix}"))
    return matches
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Collects the target files (from --file-list, from --files, or by
    scanning a directory tree), runs process_file on each, and prints a
    per-file report plus a summary in batch mode (more than one file).

    Returns:
        Process exit code: 0 on success, 1 on usage/IO errors.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Clean up duplicate #include directives and self-includes in C++ files'
    )
    parser.add_argument(
        'path',
        nargs='?',
        default='src',
        help='Root directory to process (default: src)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be changed without modifying files'
    )
    parser.add_argument(
        '--extensions',
        nargs='+',
        default=['.cxx', '.hxx', '.pxx', '.lxx', '.gxx'],
        help='File extensions to process (default: .cxx .hxx .pxx .lxx .gxx)'
    )
    parser.add_argument(
        '--files',
        nargs='+',
        help='Specific files to process (overrides path scanning)'
    )
    parser.add_argument(
        '--file-list',
        help='Path to a file containing list of files to process (one per line)'
    )

    args = parser.parse_args()

    if args.file_list:
        # Read files from a list file, keeping only paths that exist
        try:
            with open(args.file_list, 'r', encoding='utf-8') as f:
                file_paths = [line.strip() for line in f if line.strip()]
            files = [os.path.abspath(f) for f in file_paths if os.path.isfile(f)]
        except Exception as e:
            print(f"Error reading file list: {e}")
            return 1
        if len(files) == 0:
            print("No valid files found in file list")
            return 0
    elif args.files:
        # Process specific files (single file mode - minimal output)
        files = [os.path.abspath(f) for f in args.files if os.path.isfile(f)]
        # CONSISTENCY FIX: mirror the --file-list branch instead of silently
        # doing nothing when none of the given paths exist.
        if len(files) == 0:
            print("No valid files found")
            return 0
    else:
        # Scan directory (batch mode - verbose output)
        root_dir = os.path.abspath(args.path)

        if not os.path.isdir(root_dir):
            print(f"Error: {root_dir} is not a directory")
            return 1

        print(f"Scanning for files in: {root_dir}")
        print(f"Extensions: {', '.join(args.extensions)}")
        files = find_files(root_dir, args.extensions)
        print(f"Found {len(files)} files to process")

    if args.dry_run:
        print("DRY RUN MODE - No files will be modified\n")

    total_modified = 0
    total_duplicates = 0
    total_self_includes = 0
    single_file_mode = len(files) == 1

    for filepath in sorted(files):
        modified, duplicates, self_include = process_file(filepath, args.dry_run)

        if modified:
            total_modified += 1
            total_duplicates += duplicates
            if self_include:
                total_self_includes += 1

            if not single_file_mode:
                print(f"Modified: {filepath}")
                if duplicates > 0:
                    print(f" - Removed {duplicates} duplicate include(s)")
                if self_include:
                    # plain string: no placeholders, so no f-string needed
                    print(" - Removed self-include")
                print()

    # Only show summary in batch mode
    if not single_file_mode:
        print("\n" + "="*70)
        print("SUMMARY")
        print("="*70)
        print(f"Files processed: {len(files)}")
        print(f"Files modified: {total_modified}")
        print(f"Duplicate includes removed: {total_duplicates}")
        print(f"Files with self-includes fixed: {total_self_includes}")

    if args.dry_run:
        print("\nThis was a dry run. Use without --dry-run to apply changes.")

    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Use SystemExit directly: built-in exit() comes from the `site` module
    # and is not guaranteed to exist (e.g. when run with `python -S`).
    raise SystemExit(main())
|