#!/bin/bash

## Copyright (C) 2026 - 2026 ENCRYPTED SUPPORT LLC <adrelanos@whonix.org>
## See the file COPYING for copying conditions.

## AI-Assisted

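## By default, clone (or fast-forward) every public, non-archived repo
## (forks included) of a GitHub user/organization into
## <dest-dir>/<repo-name>/. Private and archived repos can be opted in,
## and forks opted out, via command-line flags.
## Idempotent. Run `github-org-clone --help` for full usage.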

## Strict mode: abort on command errors, unset variables and failed
## pipeline stages; let ERR traps reach functions and errexit reach
## command substitutions; make an over-long 'shift' print an error.
set -o errexit -o nounset -o pipefail -o errtrace
shopt -s inherit_errexit shift_verbose

# shellcheck source=../libexec/developer-meta-files/github-org-lib.bsh
source /usr/libexec/developer-meta-files/github-org-lib.bsh

# shellcheck source=../../../helper-scripts/usr/libexec/helper-scripts/strings.bsh
source "${HELPER_SCRIPTS_PATH:-}/usr/libexec/helper-scripts/strings.bsh"
## Check that 'wc' works (R-083).
# shellcheck source=../../../helper-scripts/usr/libexec/helper-scripts/wc-test.sh
source "${HELPER_SCRIPTS_PATH:-}/usr/libexec/helper-scripts/wc-test.sh"

## Option defaults. Every value below can be overridden by the
## command-line parser further down.

## Mode selection.
# G-030: --apply or --dry-run required.
mode_set=0
dry_run=0

## Repo selection: 'true'/'false' strings handed to ghorg_list_repos,
## plus the name regexes handed to ghorg_filter_names (empty = unset).
include_forks='true'
include_archived='false'
include_private='false'
exclude_re=''
include_re=''

## Transfer settings.
url_scheme='ssh'
shallow=0
jobs=4

## Diagnostics.
verbose=0

## Print the usage text to stdout.
##
## The heredoc delimiter is quoted, so the text is emitted literally:
## '${GITHUB_TOKEN}' is shown to the user as written, not expanded.
## Callers that want the text on stderr redirect it themselves.
## Arguments: none.
show_help() {
  cat <<'EOF'
Clone or update every selected repo of a GitHub user/organization
into a local directory tree. Idempotent - re-running fetches and
fast-forwards each existing clone, never merging or rebasing.

Repos will be cloned to (or updated in) the current working directory
if no dest dir is specified.

Usage:
  github-org-clone --apply [OPTIONS] <source-org> [<dest-dir>]
  github-org-clone --dry-run [OPTIONS] <source-org> [<dest-dir>]

Options:
  --include-private    include private repos (default: skip)
  --include-archived   include archived repos (default: skip)
  --exclude-forks      skip fork repos        (default: include)
  --include REGEX      only clone repos whose name matches REGEX
  --exclude REGEX      skip cloning repos whose name matches REGEX
  --shallow            clone with --depth=1
  --ssh                use git@github.com:... URLs (default)
  --https              use https://github.com/... URLs
  --jobs N             parallel clones/fetches (default: 4)
  --apply              perform clones/updates (mutates local fs)
  --dry-run            report planned actions, do nothing
  -v, --verbose        print each git command
  -h, --help           show this help and exit

Auth: ${GITHUB_TOKEN} env var, or ~/.config/github-token with
permissions 0600. If fetching over SSH, the local system is also
expected to have a suitable SSH key present and configured in GitHub.
Authentication is optional for public reads over HTTPS.
EOF
}

## Consume leading options from the positional parameters. Parsing
## stops at '--' (which is dropped) or at the first word that does not
## start with '-'; whatever remains is handled as positional arguments
## by the code that follows this loop.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --include-private) include_private='true'; shift ;;
    --include-archived) include_archived='true'; shift ;;
    --exclude-forks) include_forks='false'; shift ;;
    ## --include / --exclude can each be passed multiple times. All
    ## includes are stacked to produce a whitelist, then all excludes
    ## are stacked to blacklist previously whitelisted values.
    ## Each value becomes one parenthesized alternative, joined with
    ## '|'. An empty value is consumed but otherwise ignored.
    --include)
      if [ "$#" -lt 2 ]; then
        die 64 "missing value for --include"
      fi
      if [ -n "$2" ]; then
        include_re="${include_re:+${include_re}|}(${2})"
      fi
      shift 2
      ;;
    --exclude)
      if [ "$#" -lt 2 ]; then
        die 64 "missing value for --exclude"
      fi
      if [ -n "$2" ]; then
        exclude_re="${exclude_re:+${exclude_re}|}(${2})"
      fi
      shift 2
      ;;
    --shallow) shallow=1; shift ;;
    --ssh) url_scheme='ssh'; shift ;;
    --https) url_scheme='https'; shift ;;
    --jobs)
      if [ "$#" -lt 2 ]; then
        die 64 "missing value for --jobs"
      fi
      ## Stored verbatim; the value is validated after parsing.
      jobs="$2"
      shift 2
      ;;
    ## Only one mode flag may appear; a second one (even a repeat of
    ## the same flag) is rejected.
    --apply)
      if [ "${mode_set}" -ne 0 ]; then
        die 64 'conflicting mode flags; specify exactly one of --apply / --dry-run'
      fi
      mode_set=1
      shift
      ;;
    --dry-run)
      if [ "${mode_set}" -ne 0 ]; then
        die 64 'conflicting mode flags; specify exactly one of --apply / --dry-run'
      fi
      mode_set=1
      dry_run=1
      shift
      ;;
    -v | --verbose) verbose=1; shift ;;
    -h | --help)
      show_help
      exit 0
      ;;
    --)
      ## Explicit end of options.
      shift
      break
      ;;
    -*)
      die 64 "unknown option: '$1'"
      ;;
    *)
      ## First positional argument: leave it in place.
      break
      ;;
  esac
done

## Exactly one or two positional arguments are accepted:
## <source-org> and an optional <dest-dir>.
case "$#" in
  1 | 2) ;;
  *)
    show_help >&2
    exit 64
    ;;
esac

source_org="$1"
## An omitted (or empty) <dest-dir> means the current directory.
dest_dir="${2:-.}"

## A mode flag (--apply / --dry-run) is mandatory.
if [ "${mode_set}" -ne 1 ]; then
  show_help >&2
  die 64 'specify exactly one of --apply / --dry-run'
fi

ghorg_require_deps
ghorg_validate_name "${source_org}" user

## Any failing link in the chain (including a '[' evaluation error)
## rejects the value.
if ! {
  is_whole_number "${jobs}" \
    && [ "${jobs}" -ge 1 ] \
    && [ "${jobs}" -le 200 ]
}; then
  die 64 "invalid --jobs value: '${jobs}' (expected integer 1..200)"
fi

## Print the clone URL of one repo of ${source_org}, without a
## trailing newline.
##
## Globals (read): url_scheme, source_org.
## Arguments: $1 - repository name.
## Outputs: the SSH or HTTPS URL on stdout; nothing at all if
##   ${url_scheme} is neither 'ssh' nor 'https'.
repo_url() {
  local repo_name="$1"

  if [ "${url_scheme}" = 'ssh' ]; then
    printf 'git@github.com:%s/%s.git' "${source_org}" "${repo_name}"
  elif [ "${url_scheme}" = 'https' ]; then
    printf 'https://github.com/%s/%s.git' "${source_org}" "${repo_name}"
  fi
}

## Clone one repo into ${dest_dir}/<repo>, or fetch and fast-forward
## the clone that already exists there.
## Each call is standalone so it can be safely backgrounded.
##
## Globals (read): dest_dir, dry_run, verbose, shallow,
##   GIT_LFS_SKIP_SMUDGE (optional override, defaults to 1).
## Arguments: $1 - repository name.
## Returns: 0 on success and on every deliberate skip (name rejected
##   by ghorg_validate_name, origin mismatch, path exists but is not a
##   git checkout, dry-run, branch not fast-forwardable); the exit
##   status of git when 'git fetch' or 'git clone' fails.
clone_or_update_one() {
  local repo repo_dir url existing_url current_branch rc
  local -a clone_args

  repo="$1"

  ghorg_validate_name "${repo}" repo || return 0

  repo_dir="${dest_dir}/${repo}"
  url="$(repo_url "${repo}")"

  ## Git submodules are valid Git repositories, but have a .git file
  ## instead of a .git directory. The file points to the git database
  ## location in the superproject's .git dir. Thus we need to tolerate
  ## both .git files and dirs.
  if [ -e "${repo_dir}/.git" ]; then
    ## Origin-URL collision check: if an existing clone's origin
    ## points elsewhere (operator reused dest_dir across source orgs
    ## and a name collided), skip rather than silently fetch from
    ## the wrong upstream.
    existing_url="$(git -C "${repo_dir}" remote get-url origin 2>/dev/null || true)"
    ## Compare the URLs even if "${existing_url}" is empty. "No
    ## remote" should cause the same error as "wrong remote".
    if [ "${existing_url}" != "${url}" ]; then
      log warn "'${repo_dir}': origin is '${existing_url}', expected '${url}'; skipping (name collision?)"
      return 0
    fi

    if [ "${dry_run}" = '1' ]; then
      log notice "DRY-RUN: update ${repo_dir}"
      return 0
    fi
    if [ "${verbose}" = '1' ]; then
      log notice "update: ${repo_dir}"
    fi

    ## G-021: same protocol restrictions as on clone. Passed with -c
    ## so they also cover checkouts that were not created by this
    ## tool and therefore lack the persisted settings.
    rc=0
    git -C "${repo_dir}" \
      -c protocol.file.allow=never \
      -c protocol.ext.allow=never \
      fetch --prune --tags -- origin || rc="$?"
    if [ "${rc}" -ne 0 ]; then
      ## Jobs run in parallel, so name the repo: git's own error
      ## output cannot be attributed to it otherwise.
      log warn "'${repo_dir}': fetch from '${url}' failed (exit ${rc})"
      return "${rc}"
    fi

    ## Empty on a detached HEAD, in which case there is nothing to
    ## fast-forward.
    current_branch="$(git -C "${repo_dir}" symbolic-ref --short HEAD 2>/dev/null || true)"
    if [ -n "${current_branch}" ]; then
      ## --ff-only refuses anything but a clean fast-forward; safer
      ## than `pull` for unattended runs where divergence would
      ## otherwise attempt a merge.
      ## @{u} is shorthand for the upstream of the current branch -
      ## immune to tag/branch name collisions.
      ## G-020: the fast-forward rewrites the working tree, so the
      ## LFS smudge filter must be suppressed here just as on clone.
      if ! GIT_LFS_SKIP_SMUDGE="${GIT_LFS_SKIP_SMUDGE:-1}" \
        git -C "${repo_dir}" merge --ff-only "@{u}" 2>/dev/null; then
        log warn "'${repo_dir}': '${current_branch}' not fast-forwardable, skipping merge"
      fi
    fi
  elif [ -e "${repo_dir}" ]; then
    log warn "'${repo_dir}' exists and is not a git checkout, skipping"
    return 0
  else
    if [ "${dry_run}" = '1' ]; then
      log notice "DRY-RUN: clone ${url} -> ${repo_dir}"
      return 0
    fi
    if [ "${verbose}" = '1' ]; then
      log notice "clone: ${url}"
    fi

    clone_args=( clone )
    if [ "${shallow}" = '1' ]; then
      clone_args+=( --depth=1 )
    fi
    ## G-021: block file:// and ext:: helper protocols to reduce
    ## attack surface.
    clone_args+=(
      --config protocol.file.allow=never
      --config protocol.ext.allow=never
    )
    clone_args+=( -- "${url}" "${repo_dir}" )

    ## G-020: prevents LFS smudge filter from auto-fetching
    ## large files by default.
    rc=0
    GIT_LFS_SKIP_SMUDGE="${GIT_LFS_SKIP_SMUDGE:-1}" git "${clone_args[@]}" || rc="$?"
    if [ "${rc}" -ne 0 ]; then
      log warn "'${repo_dir}': clone of '${url}' failed (exit ${rc})"
      return "${rc}"
    fi
  fi
}

## List the selected repos of ${source_org} and clone/update each one
## via clone_or_update_one, running at most ${jobs} of them at a time.
##
## Globals (read): dest_dir, dry_run, jobs, source_org,
##   include_private, include_archived, include_forks, include_re,
##   exclude_re.
## Arguments: none.
## Returns: 0 when no repo matched or every job succeeded; 1 when at
##   least one background job exited non-zero (all repos are still
##   processed before returning).
main() {
  local repos repo_count active failed repo

  ## --dry-run promises not to touch the filesystem, so the
  ## destination directory is only created when actually applying.
  if [ "${dry_run}" != '1' ]; then
    mkdir -p -- "${dest_dir}"
  fi

  repos="$(ghorg_list_repos "${source_org}" \
    "${include_private}" "${include_archived}" "${include_forks}" \
    | ghorg_filter_names "${include_re}" "${exclude_re}" \
    | sort --unique)"

  if [ -z "${repos}" ]; then
    log notice 'no repos matched.'
    return 0
  fi

  repo_count="$(printf '%s\n' "${repos}" | wc --lines)"
  log notice "${repo_count} repos to process under ${dest_dir}"

  ## Bounded parallelism: keep at most ${jobs} background jobs alive.
  ## 'wait -n' reaps one finished job and yields its exit status; the
  ## '||' keeps errexit from aborting the loop while still recording
  ## the failure.
  active=0
  failed=0
  while IFS= read -r repo; do
    [ -z "${repo}" ] && continue
    clone_or_update_one "${repo}" &
    active=$(( active + 1 ))
    if [ "${active}" -ge "${jobs}" ]; then
      wait -n || failed=$(( failed + 1 ))
      active=$(( active - 1 ))
    fi
  done <<< "${repos}"

  ## Drain the remaining jobs one at a time: a bare 'wait' would
  ## discard their exit statuses.
  while [ "${active}" -gt 0 ]; do
    wait -n || failed=$(( failed + 1 ))
    active=$(( active - 1 ))
  done

  if [ "${failed}" -gt 0 ]; then
    log warn "${failed} of ${repo_count} repos failed to clone/update"
    return 1
  fi
}

## Entry point. Runs after the option parsing and argument validation
## above; main takes no arguments and reads its settings from the
## globals set there.
main
