Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
git-cron/repo_state.json
git-cron/config.json
42 changes: 42 additions & 0 deletions git-cron/Readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Git Repository Monitor (monitor_repos.py)

This script, `monitor_repos.py`, is designed to be executed as a cron job to continuously monitor a set of specified Git repositories.

## Purpose

The primary function of this script is twofold:
1. **Monitor Repositories:** It keeps track of a pre-defined list of Git repositories, likely checking for updates, new commits, or other relevant changes.
2. **Submit to GMT for Benchmarking:** Upon detecting certain conditions (e.g., changes in a specific repository or a new repository being added to the monitored list), it facilitates the submission of *another* designated repository to the GMT (Green Metrics Tool) system for benchmarking purposes. This allows for automated performance or quality analysis of a project.

## Setup

To run this script, it is highly recommended to set up a Python virtual environment to manage dependencies.

1. **Create a Virtual Environment:**
```bash
python3 -m venv venv
```
2. **Activate the Virtual Environment:**
```bash
source venv/bin/activate
```
3. **Install Dependencies:**
Install the necessary Python packages using pip:
```bash
pip install -r requirements.txt
```

## Usage (Cron Job)

Once set up, this script can be scheduled to run periodically using a cron job. An example cron entry might look like this (adjust the path to `monitor_repos.py` and the frequency as needed):

```cron
0 */4 * * * /path/to/your/git-cron/venv/bin/python /path/to/your/git-cron/monitor_repos.py >> /var/log/monitor_repos.log 2>&1
```
This example would run the script every 4 hours.

## Configuration

The script's behavior is configured via `config.json` and its state is managed by `repo_state.json`. Please refer to these files for detailed configuration options and how to manage the monitored repositories and benchmarking triggers.

In the `variables` section you can use the magic keyword `__GIT_HASH__`, which will be replaced with the hash of the latest git commit on the branch that you are watching.
22 changes: 22 additions & 0 deletions git-cron/config.json.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this not be a file that is rather a config.json.example and the actual config.json ignored in .gitignore?

"api": {
"api_url": "https://api.green-coding.io/",
"token": "DEFAULT",
"timeout": 30
},
"repos": [
{
"name": "NextCloud Master Every Commit",
"repo_to_watch": "https://github.com/nextcloud/server",
"repo_to_run": "https://github.com/green-coding-solutions/nextcloud-runner",
"machine_id": 12,
"email": "didi@green-coding.io",
"branch_to_run": "main",
"branch_to_watch": "master",
"filename": "usage_scenario_master.yml",
"variables": {
"__GMT_VAR_NCHASH__": "__GIT_HASH__"
}
}
]
}
308 changes: 308 additions & 0 deletions git-cron/monitor_repos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
#!/usr/bin/env python3

from __future__ import annotations

import argparse
from dataclasses import dataclass
import json
import os
import sys
from typing import Any, Dict, Optional

from urllib.parse import urlparse, quote_plus

import requests
import time
from datetime import datetime
import builtins

# Keep a handle on the built-in print so the wrapper can delegate to it.
original_print = print

def print_with_timestamp(*args, **kwargs):
    """Prefix every printed line with a `[YYYY-mm-dd HH:MM:SS]` timestamp."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    original_print(f"[{stamp}]", *args, **kwargs)

# Route every subsequent print() call in this process through the wrapper,
# so cron log lines are timestamped without touching each call site.
builtins.print = print_with_timestamp

# ---- GMT submit ----

class APIEmptyResponse204(Exception):
    """Raised when the GMT API responds with HTTP 204 (no content available)."""
    pass


class APIError(Exception):
    """Raised for any other failed or malformed GMT API response."""
    pass

@dataclass
class APIClient:
    """Minimal client for the GMT (Green Metrics Tool) HTTP API.

    Attributes:
        api_url: Base URL of the API, e.g. "https://api.green-coding.io/".
        token: Optional token sent in the `X-Authentication` header.
        timeout: Per-request timeout in seconds.
    """
    api_url: str
    token: Optional[str] = None
    timeout: int = 30

    def _auth_headers(self) -> Dict[str, str]:
        """Build request headers, adding the auth token only when set."""
        headers = {"Content-Type": "application/json"}
        if self.token:
            headers["X-Authentication"] = self.token
        return headers

    def _request(self, path: str, method: str = "GET", json_body: Optional[Dict[str, Any]] = None,) -> Optional[Dict[str, Any]]:
        """Perform an HTTP request against the API and decode the response.

        Returns the decoded JSON dict, or None for HTTP 202 (Accepted).

        Raises:
            APIEmptyResponse204: on HTTP 204.
            APIError: on any HTTP error status, a non-JSON body, or a JSON
                body whose `success` flag is not True.
        """
        url = self.api_url.rstrip("/") + path

        resp = requests.request(
            method=method.upper(),
            url=url,
            json=json_body,  # requests omits the body when this is None
            headers=self._auth_headers(),
            timeout=self.timeout,
        )

        if resp.status_code == 204:
            raise APIEmptyResponse204("No data (HTTP 204)")
        if resp.status_code == 202:
            return None  # Accepted

        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            # Prefer the structured error from the body; fall back to raw text.
            try:
                data = resp.json()
            except ValueError:  # requests.JSONDecodeError subclasses ValueError
                raise APIError(f"HTTP {resp.status_code}: {resp.text}") from e
            err = data.get("err", data)
            raise APIError(f"HTTP {resp.status_code}: {err}") from e

        try:
            data = resp.json()
        except ValueError as e:
            raise APIError(f"Expected JSON but got: {resp.text[:200]}...") from e

        # The API signals application-level failure via a `success` flag.
        if isinstance(data, dict) and data.get("success") is not True:
            err = data.get("err")
            if isinstance(err, list) and err:
                first = err[0]
                msg = (first.get("msg") if isinstance(first, dict) else str(first)) or str(err)
                raise APIError(msg)
            raise APIError(str(err))

        return data

    def submit_software(self, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Submit a software/run definition to `/v1/software/add`.

        String values are stripped of surrounding whitespace before sending.
        The caller's `payload` dict is left unmodified.
        """
        cleaned = {k: (v.strip() if isinstance(v, str) else v) for k, v in payload.items()}
        return self._request("/v1/software/add", method="POST", json_body=cleaned)


# ---- Git helpers: get latest commit for GitHub / GitLab ----

class GitError(Exception):
    """Raised when a repo URL cannot be handled or a git host API call fails."""
    pass


def get_latest_commit(repo_url: str, branch: Optional[str] = None, timeout: int = 10) -> Optional[str]:
    """
    For a GitHub or GitLab repo URL, return the latest commit hash (str) on the
    requested branch (default branch if none given), or None if not found.

    GitHub:
      https://github.com/{owner}/{repo}
      -> GET https://api.github.com/repos/{owner}/{repo}/commits?per_page=1[&sha=branch]
      -> data[0]['sha']

    GitLab:
      https://gitlab.com/{group}/{project}
      -> GET https://gitlab.com/api/v4/projects/{urlencoded(group/project)}/repository/commits?per_page=1[&ref_name=branch]
      -> data[0]['id']

    Raises GitError for unsupported hosts, malformed URLs, network failures,
    non-200 API responses, or non-JSON response bodies.
    """
    parsed = urlparse(repo_url)
    host = parsed.netloc.lower()
    path = parsed.path.strip("/")

    if not path:
        raise GitError(f"Repo URL seems incomplete: {repo_url}")

    if "github.com" in host:
        # path = owner/repo[/...]; we only need first two segments
        parts = path.split("/")
        if len(parts) < 2:
            raise GitError(f"Cannot parse GitHub repo from URL: {repo_url}")
        owner, repo = parts[0], parts[1]
        api_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
        params = {"per_page": 1}
        if branch:
            params["sha"] = branch
        # Plain requests call on purpose (not APIClient): we do not need the
        # GMT-specific response decoding here -- any failure is just a GitError.
        try:
            resp = requests.get(api_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            raise GitError(f"Request to GitHub API failed: {exc}") from exc
        if resp.status_code != 200:
            raise GitError(f"GitHub API error {resp.status_code}: {resp.text[:200]}")
        try:
            data = resp.json()
        except ValueError as exc:
            raise GitError(f"GitHub API returned invalid JSON: {resp.text[:200]}") from exc
        if not data:
            return None
        return data[0].get("sha")

    elif "gitlab" in host:
        # works for gitlab.com and self-hosted GitLab domains containing "gitlab"
        project = quote_plus(path)
        api_root = f"{parsed.scheme}://{parsed.netloc}"
        api_url = f"{api_root}/api/v4/projects/{project}/repository/commits"
        params = {"per_page": 1}
        if branch:
            params["ref_name"] = branch
        try:
            resp = requests.get(api_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            raise GitError(f"Request to GitLab API failed: {exc}") from exc
        if resp.status_code != 200:
            raise GitError(f"GitLab API error {resp.status_code}: {resp.text[:200]}")
        try:
            data = resp.json()
        except ValueError as exc:
            raise GitError(f"GitLab API returned invalid JSON: {resp.text[:200]}") from exc
        if not data:
            return None
        # GitLab uses "id" for commit hash
        return data[0].get("id")

    else:
        raise GitError(f"Unsupported git host in URL: {repo_url} (only GitHub/GitLab supported)")


# ---- State helpers ----

def load_json(path: str) -> Dict[str, Any]:
    """Load a JSON file and return its contents as a dict.

    Returns an empty dict when the file is missing, unreadable, or contains
    invalid JSON, so callers can fall back to default behaviour. Unexpected
    failures are logged instead of being silently swallowed.
    """
    if not os.path.isfile(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; any other exception type
        # would indicate a bug and should surface rather than be hidden.
        print(f"[WARN] Could not read JSON from {path}: {e}", file=sys.stderr)
        return {}


def save_json(path: str, data: Dict[str, Any]) -> None:
    """Atomically write `data` as pretty-printed JSON to `path`.

    Writes to a sibling `.tmp` file first and then renames it over the
    target, so a crash mid-write cannot leave a corrupted state file.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, sort_keys=True)
        os.replace(tmp_path, path)  # atomic on both POSIX and Windows
    except Exception:
        # Best-effort cleanup of the temp file before re-raising.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise


# ---- Main monitoring logic ----

def process_repo(client: APIClient, repo_cfg: Dict[str, Any], state: Dict[str, Any], global_timeout: int) -> None:
    """Check one watched repo for a new commit and submit a GMT run if found.

    Looks up the latest commit on `repo_to_watch`/`branch_to_watch`, compares
    it with the last commit recorded in `state`, and on a change submits
    `repo_to_run` to the GMT API. `state` is updated in place; `repo_cfg` is
    never modified.
    """
    repo_to_watch: str = repo_cfg["repo_to_watch"]
    name: str = repo_cfg.get("name", repo_to_watch)
    branch_to_watch: Optional[str] = repo_cfg.get("branch_to_watch", "main")

    print(f"Checking repo: {name} ({repo_to_watch}:{branch_to_watch})")

    try:
        latest_commit = get_latest_commit(repo_to_watch, branch=branch_to_watch, timeout=global_timeout)
    except GitError as e:
        print(f"[ERROR] {e}")
        return

    if not latest_commit:
        print("No commits found on remote (empty repo?). Skipping.")
        return

    # Key the state per repo+branch so multiple branches of one repo can be
    # watched independently.
    state_key = f"{repo_to_watch}#{branch_to_watch}" if branch_to_watch else repo_to_watch
    repo_state = state.get(state_key, {})
    last_seen = repo_state.get("last_commit", None)

    print(f" Last seen: {last_seen}")
    print(f" Latest : {latest_commit}")

    if last_seen == latest_commit:
        print(" No new commits. Nothing to do.")
        return

    print(" New commit detected. Submitting job.")

    payload_base: Dict[str, Any] = {
        "name": name,
        "repo_url": repo_cfg["repo_to_run"],
        "machine_id": repo_cfg["machine_id"],
        "branch": repo_cfg.get("branch_to_run", "main"),
        "filename": repo_cfg.get("filename", "usage_scenario.yml"),
        "schedule_mode": "one-off",
    }

    if "email" in repo_cfg and repo_cfg["email"]:
        payload_base["email"] = repo_cfg["email"]

    vars_cfg = repo_cfg.get("variables")

    if isinstance(vars_cfg, dict) and vars_cfg:
        # Substitute the magic __GIT_HASH__ keyword WITHOUT mutating the
        # caller's config dict: the placeholder must survive so a later call
        # can substitute the then-current commit hash again.
        payload_base["usage_scenario_variables"] = {
            k: (latest_commit if v == "__GIT_HASH__" else v) for k, v in vars_cfg.items()
        }

    try:
        resp = client.submit_software(dict(payload_base))
        if resp is None:
            print("Run: Accepted (202), queued.")
        else:
            print(f"Run: Unexpected response: {resp}")
    except APIEmptyResponse204:
        print("Run: API returned 204 No Content.")
    except APIError as e:
        print(f"Run: API error: {e}")
    except requests.RequestException as e:
        print(f"Run: HTTP error: {e}")

    # Only update state after attempting submissions. NOTE(review): the state
    # advances even when the submission failed, so a failed submit is not
    # retried for the same commit -- presumably to avoid hammering the API
    # from cron; confirm this is the intended trade-off.
    state[state_key] = {"last_commit": latest_commit}
    print(f"Updated state: last_commit = {latest_commit}")


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser for the monitor script."""
    parser = argparse.ArgumentParser(
        description="Monitor GitHub/GitLab repos and submit GMT jobs on new commits."
    )
    parser.add_argument(
        "--config",
        default="config.json",
        help="Path to JSON config file (see script docstring for structure).",
    )
    parser.add_argument(
        "--state",
        default="repo_state.json",
        help="Path to JSON state file (will be created/updated). Default: repo_state.json",
    )
    return parser


def main() -> None:
    """Entry point: read config, check every watched repo, persist state."""
    args = build_arg_parser().parse_args()

    # A missing or unreadable config file yields an empty dict -> abort.
    config = load_json(args.config)
    if not config:
        print(f"Failed to read config file {args.config}", file=sys.stderr)
        sys.exit(1)

    api_cfg = config.get("api", {})
    api_url = api_cfg.get("api_url", "https://api.green-coding.io/").strip()
    token = api_cfg.get("token", "DEFAULT").strip()
    timeout = int(api_cfg.get("timeout", 30))

    repos = config.get("repos", [])
    if not repos:
        print("No repos configured under config['repos'].", file=sys.stderr)
        sys.exit(1)

    client = APIClient(api_url=api_url, token=token, timeout=timeout)
    state = load_json(args.state)

    for repo_cfg in repos:
        process_repo(client, repo_cfg, state, timeout)

    # Persist the updated last-seen commits for the next cron invocation.
    save_json(args.state, state)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions git-cron/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests==2.32.5