Skip to content

Refactor/ingestion #209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,6 @@ Caddyfile

# ignore default output directory
tmp/*

# Gitingest
digest.txt
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ repos:
files: ^src/
additional_dependencies:
[
chardet,
click,
fastapi-analytics,
pytest-asyncio,
Expand All @@ -112,6 +113,7 @@ repos:
- --rcfile=tests/.pylintrc
additional_dependencies:
[
chardet,
click,
fastapi-analytics,
pytest,
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ Gitingest aims to be friendly for first time contributors, with a simple Python
- [tiktoken](https://github.com/openai/tiktoken) - Token estimation
- [posthog](https://github.com/PostHog/posthog) - Amazing analytics

### Looking for a JavaScript/Node package?
### Looking for a JavaScript/FileSystemNode package?

Check out the NPM alternative 📦 Repomix: <https://github.com/yamadashy/repomix>

Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
[project]
name = "gitingest"
version = "0.1.3"
version = "0.1.4"
description="CLI tool to analyze and create text dumps of codebases for LLMs"
readme = {file = "README.md", content-type = "text/markdown" }
requires-python = ">= 3.8"
dependencies = [
"click>=8.0.0",
"tiktoken",
"tomli",
"typing_extensions; python_version < '3.10'",
]

Expand Down Expand Up @@ -52,6 +53,7 @@ disable = [
"too-few-public-methods",
"broad-exception-caught",
"duplicate-code",
"fixme",
]

[tool.pycln]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
chardet
click>=8.0.0
fastapi[standard]
python-dotenv
Expand Down
8 changes: 4 additions & 4 deletions src/gitingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
""" Gitingest: A package for ingesting data from Git repositories. """

from gitingest.query_ingestion import run_ingest_query
from gitingest.query_parser import parse_query
from gitingest.repository_clone import clone_repo
from gitingest.cloning import clone_repo
from gitingest.ingestion import ingest_query
from gitingest.query_parsing import parse_query
from gitingest.repository_ingest import ingest, ingest_async

__all__ = ["run_ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"]
__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"]
8 changes: 4 additions & 4 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import click

from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_PATH
from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
from gitingest.repository_ingest import ingest_async


Expand Down Expand Up @@ -92,15 +92,15 @@ async def _async_main(
include_patterns = set(include_pattern)

if not output:
output = OUTPUT_FILE_PATH
output = OUTPUT_FILE_NAME
summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output)

click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
click.echo(summary)

except Exception as e:
click.echo(f"Error: {e}", err=True)
except Exception as exc:
click.echo(f"Error: {exc}", err=True)
raise click.Abort()


Expand Down
16 changes: 10 additions & 6 deletions src/gitingest/repository_clone.py → src/gitingest/cloning.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
from typing import List, Optional, Tuple

from gitingest.utils import async_timeout
from gitingest.utils.timeout_wrapper import async_timeout

TIMEOUT: int = 60

Expand Down Expand Up @@ -38,6 +38,7 @@ class CloneConfig:
commit: Optional[str] = None
branch: Optional[str] = None
subpath: str = "/"
blob: bool = False


@async_timeout(TIMEOUT)
Expand Down Expand Up @@ -72,14 +73,15 @@ async def clone_repo(config: CloneConfig) -> None:
parent_dir = Path(local_path).parent
try:
os.makedirs(parent_dir, exist_ok=True)
except OSError as e:
raise OSError(f"Failed to create parent directory {parent_dir}: {e}") from e
except OSError as exc:
raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc

# Check if the repository exists
if not await _check_repo_exists(url):
raise ValueError("Repository not found, make sure it is public")

clone_cmd = ["git", "clone", "--recurse-submodules", "--single-branch"]
clone_cmd = ["git", "clone", "--single-branch"]
# TODO re-enable --recurse-submodules

if partial_clone:
clone_cmd += ["--filter=blob:none", "--sparse"]
Expand All @@ -98,7 +100,10 @@ async def clone_repo(config: CloneConfig) -> None:
checkout_cmd = ["git", "-C", local_path]

if partial_clone:
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
if config.blob:
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")[:-1]]
else:
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]

if commit:
checkout_cmd += ["checkout", commit]
Expand Down Expand Up @@ -149,7 +154,6 @@ async def _check_repo_exists(url: str) -> bool:
raise RuntimeError(f"Unexpected status code: {status_code}")


@async_timeout(TIMEOUT)
async def fetch_remote_branch_list(url: str) -> List[str]:
"""
Fetch the list of branches from a remote Git repository.
Expand Down
2 changes: 1 addition & 1 deletion src/gitingest/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@
MAX_FILES = 10_000 # Maximum number of files to process
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB

OUTPUT_FILE_PATH = "digest.txt"
OUTPUT_FILE_NAME = "digest.txt"

TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest"
143 changes: 143 additions & 0 deletions src/gitingest/filesystem_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
""" Define the schema for the filesystem representation. """

from __future__ import annotations

import os
from dataclasses import dataclass, field
from enum import Enum, auto
from pathlib import Path

from gitingest.exceptions import InvalidNotebookError
from gitingest.utils.ingestion_utils import _get_encoding_list
from gitingest.utils.notebook_utils import process_notebook
from gitingest.utils.textfile_checker_utils import is_textfile

# Rule of 48 "=" characters plus a newline; FileSystemNode.content_string emits it before and
# after the "File: ..." header line.
SEPARATOR = f"{'=' * 48}\n"


class FileSystemNodeType(Enum):
    """
    Enum representing the type of a file system node (directory or file).

    ``FileSystemNode.type`` holds one of these members, and the node's methods branch on it by
    equality comparison. The concrete values come from ``auto()``.
    """

    DIRECTORY = auto()  # the node is a directory
    FILE = auto()  # the node is a file


@dataclass
class FileSystemStats:
    """
    Class for tracking statistics during file system traversal.

    A new instance starts with an empty ``visited`` set and both counters at zero. Each instance
    gets its own set because the default is produced by ``default_factory``.
    """

    # Paths already seen; presumably used to avoid handling the same path twice -- verify against the traversal code
    visited: set[Path] = field(default_factory=set)
    # Running number of files counted so far
    total_files: int = 0
    # Running total size; presumably in bytes -- confirm against the code that updates it
    total_size: int = 0


@dataclass
class FileSystemNode:  # pylint: disable=too-many-instance-attributes
    """
    A single entry (file or directory) of the file system tree.

    The number of attributes is deliberately above pylint's recommended limit: a node carries its
    name, its type, its location (as string and as ``Path``), size/count/depth figures and its
    child nodes.
    """

    name: str
    type: FileSystemNodeType  # FileSystemNodeType.DIRECTORY or FileSystemNodeType.FILE
    path_str: str
    path: Path
    size: int = 0
    file_count: int = 0
    dir_count: int = 0
    depth: int = 0
    children: list[FileSystemNode] = field(default_factory=list)  # default_factory gives each node its own list

    def sort_children(self) -> None:
        """
        Rearrange ``self.children`` into five consecutive groups.

        Resulting order:
        1. Files named README.md (name compared case-insensitively), in their existing order
        2. Regular files (not starting with dot)
        3. Hidden files (starting with dot)
        4. Regular directories (not starting with dot)
        5. Hidden directories (starting with dot)
        Groups 2 to 5 are each sorted by name.
        """
        readmes = []
        plain_files = []
        dot_files = []
        plain_dirs = []
        dot_dirs = []

        # One pass over the children, dropping each node into the bucket it belongs to
        for node in self.children:
            if node.type == FileSystemNodeType.FILE:
                if node.name.lower() == "readme.md":
                    readmes.append(node)
                elif node.name.startswith("."):
                    dot_files.append(node)
                else:
                    plain_files.append(node)
            elif node.type == FileSystemNodeType.DIRECTORY:
                bucket = dot_dirs if node.name.startswith(".") else plain_dirs
                bucket.append(node)

        def by_name(node: FileSystemNode) -> str:
            return node.name

        # README entries are not re-sorted; every other bucket is ordered by name (stable sort)
        self.children = [
            *readmes,
            *sorted(plain_files, key=by_name),
            *sorted(dot_files, key=by_name),
            *sorted(plain_dirs, key=by_name),
            *sorted(dot_dirs, key=by_name),
        ]

    @property
    def content_string(self) -> str:
        """
        Render the node as a text block: separator, ``File: <path>`` line, separator, content.

        Returns
        -------
        str
            The header followed by the value of ``self.content`` and two newlines.
        """
        # Swap the OS-specific separator for "/" so output paths always use forward slashes
        display_path = str(self.path_str).replace(os.sep, "/")
        header = f"{SEPARATOR}File: {display_path}\n{SEPARATOR}"
        return f"{header}{self.content}\n\n"

    @property
    def content(self) -> str:  # pylint: disable=too-many-return-statements
        """
        Read the content of the node's path and return it as text.

        A file node that ``is_textfile`` rejects yields the placeholder ``"[Non-text file]"``.
        A path with the ``.ipynb`` suffix is handed to ``process_notebook``. Any other path is
        opened with each encoding from ``_get_encoding_list()`` in turn, and the first successful
        read is returned. Notebook, decoding and OS errors are reported through the returned
        string rather than raised.

        Returns
        -------
        str
            The content of the file, the non-text placeholder, or an error message.
        """
        is_file_node = self.type == FileSystemNodeType.FILE
        if is_file_node and not is_textfile(self.path):
            return "[Non-text file]"

        try:
            if self.path.suffix == ".ipynb":
                try:
                    notebook_text = process_notebook(self.path)
                except Exception as exc:
                    return f"Error processing notebook: {exc}"
                else:
                    return notebook_text

            for candidate in _get_encoding_list():
                try:
                    with self.path.open(encoding=candidate) as handle:
                        text = handle.read()
                except UnicodeDecodeError:
                    # Wrong guess for this file; fall through to the next candidate encoding
                    continue
                except OSError as exc:
                    return f"Error reading file: {exc}"
                else:
                    return text

            return "Error: Unable to decode file with available encodings"

        except (OSError, InvalidNotebookError) as exc:
            return f"Error reading file: {exc}"
Loading
Loading