diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..171a8e2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+git-cron/repo_state.json
+git-cron/config.json
diff --git a/git-cron/Readme.txt b/git-cron/Readme.txt
new file mode 100644
index 0000000..0612712
--- /dev/null
+++ b/git-cron/Readme.txt
@@ -0,0 +1,42 @@
+# Git Repository Monitor (monitor_repos.py)
+
+This script, `monitor_repos.py`, is designed to be executed as a cron job to continuously monitor a set of specified Git repositories.
+
+## Purpose
+
+The primary function of this script is twofold:
+1. **Monitor Repositories:** It keeps track of a pre-defined list of Git repositories, checking the watched branch of each one for new commits.
+2. **Submit to GMT for Benchmarking:** Upon detecting a new commit on a watched repository (or a newly added repository with no recorded state), it submits *another* designated repository to the GMT (Green Metrics Tool) system for benchmarking purposes. This allows for automated performance or quality analysis of a project.
+
+## Setup
+
+To run this script, it is highly recommended to set up a Python virtual environment to manage dependencies.
+
+1. **Create a Virtual Environment:**
+   ```bash
+   python3 -m venv venv
+   ```
+2. **Activate the Virtual Environment:**
+   ```bash
+   source venv/bin/activate
+   ```
+3. **Install Dependencies:**
+   Install the necessary Python packages using pip:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+## Usage (Cron Job)
+
+Once set up, this script can be scheduled to run periodically using a cron job. An example cron entry might look like this (adjust the path to `monitor_repos.py` and the frequency as needed):
+
+```cron
+0 */4 * * * /path/to/your/git-cron/venv/bin/python /path/to/your/git-cron/monitor_repos.py >> /var/log/monitor_repos.log 2>&1
+```
+This example would run the script every 4 hours.
+
+## Configuration
+
+The script's behavior is configured via `config.json` and its state is managed by `repo_state.json`. Please refer to these files for detailed configuration options and how to manage the monitored repositories and benchmarking triggers.
+
+In the `variables` section you can use the magic keyword `__GIT_HASH__` that will be replaced with the hash of the git commit on the branch that you are watching.
\ No newline at end of file
diff --git a/git-cron/config.json.example b/git-cron/config.json.example
new file mode 100644
index 0000000..73a50e7
--- /dev/null
+++ b/git-cron/config.json.example
@@ -0,0 +1,22 @@
+{
+  "api": {
+    "api_url": "https://api.green-coding.io/",
+    "token": "DEFAULT",
+    "timeout": 30
+  },
+  "repos": [
+    {
+      "name": "NextCloud Master Every Commit",
+      "repo_to_watch": "https://github.com/nextcloud/server",
+      "repo_to_run": "https://github.com/green-coding-solutions/nextcloud-runner",
+      "machine_id": 12,
+      "email": "didi@green-coding.io",
+      "branch_to_run": "main",
+      "branch_to_watch": "master",
+      "filename": "usage_scenario_master.yml",
+      "variables": {
+        "__GMT_VAR_NCHASH__": "__GIT_HASH__"
+      }
+    }
+  ]
+}
diff --git a/git-cron/monitor_repos.py b/git-cron/monitor_repos.py
new file mode 100644
index 0000000..fdcbdf8
--- /dev/null
+++ b/git-cron/monitor_repos.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+import json
+import os
+import sys
+from typing import Any, Dict, Optional
+
+from urllib.parse import urlparse, quote_plus
+
+import requests
+import time
+from datetime import datetime
+import builtins
+
+original_print = print
+
+def print_with_timestamp(*args, **kwargs):
+    """Print with timestamp prefix."""
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    args = (f"[{timestamp}]",) + args
+    original_print(*args, **kwargs)
+
+builtins.print = print_with_timestamp
+
+# ---- GMT submit ----
+
+class 
class APIEmptyResponse204(Exception):
    """Raised when the API answers HTTP 204 (no content available)."""
    pass


class APIError(Exception):
    """Raised for any error or non-success response from the GMT API."""
    pass


@dataclass
class APIClient:
    """Minimal client for the GMT (Green Metrics Tool) HTTP API.

    Attributes:
        api_url: Base URL of the API, e.g. "https://api.green-coding.io/".
        token: Optional token sent in the X-Authentication header.
        timeout: Per-request timeout in seconds.
    """

    api_url: str
    token: Optional[str] = None
    timeout: int = 30

    def _auth_headers(self) -> Dict[str, str]:
        """Return the default request headers, adding auth only if a token is set."""
        headers = {"Content-Type": "application/json"}
        if self.token:
            headers["X-Authentication"] = self.token
        return headers

    def _request(self, path: str, method: str = "GET", json_body: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
        """Perform a single API request and normalize its outcome.

        Returns:
            The decoded JSON payload, or None for HTTP 202 (job accepted).

        Raises:
            APIEmptyResponse204: On HTTP 204 (no content).
            APIError: On HTTP errors, non-JSON bodies, or success=False payloads.
        """
        url = self.api_url.rstrip("/") + path

        resp = requests.request(
            method=method.upper(),
            url=url,
            json=json_body,  # requests treats None as "no body"
            headers=self._auth_headers(),
            timeout=self.timeout,
        )

        if resp.status_code == 204:
            raise APIEmptyResponse204("No data (HTTP 204)")
        if resp.status_code == 202:
            return None  # Accepted

        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            # Surface the API's structured "err" field when the error body is JSON.
            try:
                data = resp.json()
            except ValueError:
                raise APIError(f"HTTP {resp.status_code}: {resp.text}") from e
            # Guard against non-dict JSON error bodies (e.g. a bare list/string).
            err = data.get("err", data) if isinstance(data, dict) else data
            raise APIError(f"HTTP {resp.status_code}: {err}") from e

        try:
            data = resp.json()
        except ValueError as e:
            raise APIError(f"Expected JSON but got: {resp.text[:200]}...") from e

        # The API signals application-level failure via {"success": false, "err": ...}.
        if isinstance(data, dict) and data.get("success") is not True:
            err = data.get("err")
            if isinstance(err, list) and err:
                first = err[0]
                msg = (first.get("msg") if isinstance(first, dict) else str(first)) or str(err)
                raise APIError(msg)
            raise APIError(str(err))

        return data

    def submit_software(self, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Submit a benchmarking job via /v1/software/add.

        String values are whitespace-trimmed. Works on a shallow copy so the
        caller's payload dict is never mutated.
        """
        cleaned = {k: v.strip() if isinstance(v, str) else v for k, v in payload.items()}
        return self._request("/v1/software/add", method="POST", json_body=cleaned)


# ---- Git helpers: get latest commit for GitHub / GitLab ----

class GitError(Exception):
    """Raised when a repo URL cannot be parsed or the git host API fails."""
    pass


def get_latest_commit(repo_url: str, branch: Optional[str] = None, timeout: int = 10) -> Optional[str]:
    """
    For a GitHub or GitLab repo URL, return the latest commit hash (str) on the
    requested branch (default branch if none given), or None if not found.

    GitHub:
        https://github.com/{owner}/{repo}
        -> GET https://api.github.com/repos/{owner}/{repo}/commits?per_page=1[&sha=branch]
        -> data[0]['sha']

    GitLab:
        https://gitlab.com/{group}/{project}
        -> GET https://gitlab.com/api/v4/projects/{urlencoded(group/project)}/repository/commits?per_page=1[&ref_name=branch]
        -> data[0]['id']

    Raises:
        GitError: If the URL is unparseable, the host is unsupported, or the
            git host API request fails.
    """
    parsed = urlparse(repo_url)
    host = parsed.netloc.lower()
    path = parsed.path.strip("/")

    if not path:
        raise GitError(f"Repo URL seems incomplete: {repo_url}")

    if "github.com" in host:
        # path = owner/repo[/...]; we only need first two segments
        parts = path.split("/")
        if len(parts) < 2:
            raise GitError(f"Cannot parse GitHub repo from URL: {repo_url}")
        owner, repo = parts[0], parts[1]
        api_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
        params = {"per_page": 1}
        if branch:
            params["sha"] = branch
        try:
            resp = requests.get(api_url, params=params, timeout=timeout)
        except Exception as exc:
            raise GitError(f"Request to GitHub API failed: {exc}") from exc
        if resp.status_code != 200:
            raise GitError(f"GitHub API error {resp.status_code}: {resp.text[:200]}")
        data = resp.json()
        if not data:
            return None
        return data[0].get("sha")

    elif "gitlab" in host:
        # works for gitlab.com and self-hosted GitLab domains containing "gitlab"
        project = quote_plus(path)
        api_root = f"{parsed.scheme}://{parsed.netloc}"
        api_url = f"{api_root}/api/v4/projects/{project}/repository/commits"
        params = {"per_page": 1}
        if branch:
            params["ref_name"] = branch
        try:
            resp = requests.get(api_url, params=params, timeout=timeout)
        except Exception as exc:
            raise GitError(f"Request to GitLab API failed: {exc}") from exc
        if resp.status_code != 200:
            raise GitError(f"GitLab API error {resp.status_code}: {resp.text[:200]}")
        data = resp.json()
        if not data:
            return None
        # GitLab uses "id" for commit hash
        return data[0].get("id")

    else:
        raise GitError(f"Unsupported git host in URL: {repo_url} (only GitHub/GitLab supported)")


# ---- State helpers ----

def load_json(path: str) -> Dict[str, Any]:
    """Load a JSON file, returning {} if it is missing or unreadable.

    Deliberately best-effort: a missing or corrupt state file should reset the
    state rather than abort the whole cron run.
    """
    if not os.path.isfile(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError is a subclass of ValueError
        return {}


def save_json(path: str, data: Dict[str, Any]) -> None:
    """Atomically write *data* as pretty-printed JSON to *path*.

    Writes to a sibling temp file and renames it into place so that an
    interrupted cron run cannot leave a half-written (corrupt) state file.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, sort_keys=True)
    os.replace(tmp_path, path)


# ---- Main monitoring logic ----

def process_repo(client: APIClient, repo_cfg: Dict[str, Any], state: Dict[str, Any], global_timeout: int) -> None:
    """Check one watched repo for a new commit and submit a GMT job if found.

    Updates ``state`` in place (keyed by "repo#branch") after the submission
    attempt, so the same commit is not submitted twice.
    """
    repo_to_watch: str = repo_cfg["repo_to_watch"]
    name: str = repo_cfg.get("name", repo_to_watch)
    branch_to_watch: Optional[str] = repo_cfg.get("branch_to_watch", "main")

    print(f"Checking repo: {name} ({repo_to_watch}:{branch_to_watch})")

    try:
        latest_commit = get_latest_commit(repo_to_watch, branch=branch_to_watch, timeout=global_timeout)
    except GitError as e:
        print(f"[ERROR] {e}")
        return

    if not latest_commit:
        print("No commits found on remote (empty repo?). Skipping.")
        return

    state_key = f"{repo_to_watch}#{branch_to_watch}" if branch_to_watch else repo_to_watch
    repo_state = state.get(state_key, {})
    last_seen = repo_state.get("last_commit", None)

    print(f" Last seen: {last_seen}")
    print(f" Latest : {latest_commit}")

    if last_seen == latest_commit:
        print(" No new commits. Nothing to do.")
        return

    print(" New commit detected. Submitting job.")

    payload_base: Dict[str, Any] = {
        "name": name,
        "repo_url": repo_cfg["repo_to_run"],
        "machine_id": repo_cfg["machine_id"],
        "branch": repo_cfg.get("branch_to_run", "main"),
        "filename": repo_cfg.get("filename", "usage_scenario.yml"),
        "schedule_mode": "one-off",
    }

    if "email" in repo_cfg and repo_cfg["email"]:
        payload_base["email"] = repo_cfg["email"]

    vars_cfg = repo_cfg.get("variables")

    if isinstance(vars_cfg, dict) and vars_cfg:
        # Substitute the magic __GIT_HASH__ placeholder into a fresh dict so
        # the caller's config dict is not mutated between runs.
        payload_base["usage_scenario_variables"] = {
            k: latest_commit if v == "__GIT_HASH__" else v for k, v in vars_cfg.items()
        }

    try:
        resp = client.submit_software(dict(payload_base))
        if resp is None:
            print("Run: Accepted (202), queued.")
        else:
            print(f"Run: Unexpected response: {resp}")
    except APIEmptyResponse204:
        print("Run: API returned 204 No Content.")
    except APIError as e:
        print(f"Run: API error: {e}")
    except requests.RequestException as e:
        print(f"Run: HTTP error: {e}")

    # Only update state after attempting submissions, so a crash above does
    # not mark the commit as handled.
    state[state_key] = {"last_commit": latest_commit}
    print(f"Updated state: last_commit = {latest_commit}")


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser (--config, --state)."""
    p = argparse.ArgumentParser(
        description="Monitor GitHub/GitLab repos and submit GMT jobs on new commits."
    )
    p.add_argument(
        "--config",
        default="config.json",
        help="Path to JSON config file (see script docstring for structure).",
    )
    p.add_argument(
        "--state",
        default="repo_state.json",
        help="Path to JSON state file (will be created/updated). Default: repo_state.json",
    )
    return p


def main() -> None:
    """Entry point: load config and state, check every repo, persist state."""
    parser = build_arg_parser()
    args = parser.parse_args()

    # Load config
    config = load_json(args.config)
    if not config:
        print(f"Failed to read config file {args.config}", file=sys.stderr)
        sys.exit(1)

    api_cfg = config.get("api", {})
    api_url = api_cfg.get("api_url", "https://api.green-coding.io/").strip()
    # Tolerate an explicit null token in the config instead of crashing on .strip()
    token = api_cfg.get("token", "DEFAULT")
    token = token.strip() if isinstance(token, str) else token
    timeout = int(api_cfg.get("timeout", 30))

    repos = config.get("repos", [])
    if not repos:
        print("No repos configured under config['repos'].", file=sys.stderr)
        sys.exit(1)

    client = APIClient(api_url=api_url, token=token, timeout=timeout)

    state = load_json(args.state)

    for repo_cfg in repos:
        process_repo(client, repo_cfg, state, timeout)

    save_json(args.state, state)


if __name__ == "__main__":
    main()

# --- remainder of the original patch (git-cron/requirements.txt), kept for reference ---
# diff --git a/git-cron/requirements.txt b/git-cron/requirements.txt
# new file mode 100644
# index 0000000..1e1bdeb
# --- /dev/null
# +++ b/git-cron/requirements.txt
# @@ -0,0 +1 @@
# +requests==2.32.5