243 changes: 243 additions & 0 deletions lib/ingestors/github_ingestor.rb
@@ -0,0 +1,243 @@
# frozen_string_literal: true

require 'open-uri'
require 'json'
require 'httparty'
require 'nokogiri'

module Ingestors
# GithubIngestor fetches repository information from GitHub to populate the materials' metadata.
# API requests per ingested repository:
# 1. Get the repo's general metadata #{GITHUB_API_BASE}/#{full_name}
# and keys: name, full_name, owner.login, html_url, description,
# homepage, topics, license.{key, spdx_id}, archived,
# created_at, pushed_at, updated_at, contributors_url
# 2. Get the DOI from #{GITHUB_API_BASE}/#{full_name}/contents/README.md
# and key: content
# 3. Get the version/release from #{GITHUB_API_BASE}/#{full_name}/releases
# and key: tag_name (first entry)
# 4. Get the contributors' list from #{GITHUB_API_BASE}/#{full_name}/contributors
# and key: login (from all entries)
class GithubIngestor < Ingestor # rubocop:disable Metrics/ClassLength
include Ingestors::Concerns::SitemapHelpers

GITHUB_API_BASE = 'https://api.github.com/repos'
CACHE_PREFIX = 'github_ingestor_'
TTL = 1.week # cache time-to-live before entries expire

def self.config
{
key: 'github',
title: 'GitHub Repository or Page',
category: :materials,
user_agent: 'TeSS Github ingestor'
}
end

# Reads from direct GitHub URLs, .xml sitemaps, and .txt sitemaps.
# Fetches repository metadata, contributors, releases, and DOIs (from README.md).
# It automatically handles both GitHub Pages URLs (github.io) and standard github.com URLs.
# API responses are cached to avoid repeated calls.
def read(source_url)
@verbose = false
# parse_sitemap returns either a list of unique URL entries or the source URL itself
sources = parse_sitemap(source_url)

sources.each do |url|
# For each source: if it is a github.{com|io} URL, derive its API URL; otherwise skip it
repo_api_url = to_github_api(url)
next unless repo_api_url

# Gets the cached repo data, or fetches and caches it
key = "#{CACHE_PREFIX}#{repo_api_url.gsub(%r{https?://}, '').gsub('/', '_')}"
repo_data = cache_fetch(key, repo_api_url)
next unless repo_data

# Add to material
add_material to_material(repo_data)
end
rescue StandardError => e
Rails.logger.error("#{e.class}: read() failed, #{e.message}")
end

private

# Takes a github.{com|io} URL and returns the corresponding api.github.com URL, or nil if it matches neither
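# e.g. https://github.com/hsf-training/cpluspluscourse
#   => https://api.github.com/repos/hsf-training/cpluspluscourse
# e.g. https://swcarpentry.github.io/python-novice-inflammation/
#   => https://api.github.com/repos/swcarpentry/python-novice-inflammation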
def to_github_api(url)
uri = URI(url)
parts = uri.path.split('/') # 'example.com/foo/bar' will have path == '/foo/bar', so three parts

# http(s)://github.com/<username>/<repo> is the only accepted github.com form
if uri.host&.downcase == 'github.com' && (uri.host.count('.') == 1) && parts.size == 3
github_api_from_com(parts)
# http(s)://<username>.github.io/<repo> is the only accepted github.io form
elsif uri.host&.downcase&.end_with?('.github.io') && (uri.host.count('.') == 2) && parts.size >= 2
github_api_from_io(uri, parts)
end
end

def github_api_from_com(parts)
"#{GITHUB_API_BASE}/#{parts[1]}/#{parts[2]}"
end

def github_api_from_io(uri, parts)
repo = parts[1]
owner = uri.host.split('.').first
"#{GITHUB_API_BASE}/#{owner}/#{repo}"
end

# Fetches cached data for a key, or opens the URL, parses the JSON, and caches the result.
# Caching matters because GitHub limits unauthenticated clients to 60 API requests per hour:
# https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users
# One ingested GitHub URL translates to up to 4 GitHub API requests.
# key: string key for the cache
# url: URL to open
# Entries expire after TTL (1 week).
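# e.g. for https://api.github.com/repos/hsf-training/cpluspluscourse, read() builds the key
# "github_ingestor_api.github.com_repos_hsf-training_cpluspluscourse"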
def cache_fetch(key, url)
Rails.cache.fetch(key, expires_in: TTL, skip_nil: true) do
JSON.parse(open_url(url).read)
end
end

# Builds an OpenStruct material from the repo data and returns it
def to_material(repo_data) # rubocop:disable Metrics/AbcSize
github_io_homepage = github_io_homepage? repo_data['homepage']
url = github_io_homepage ? repo_data['homepage'] : repo_data['html_url']
redirected_url = get_redirected_url(url)
html = get_html(redirected_url)

material = OpenStruct.new
material.title = repo_data['name'].titleize
material.url = url
material.description = github_io_homepage ? fetch_definition(html, redirected_url) : repo_data['description']
material.keywords = repo_data['topics']
material.licence = fetch_licence(repo_data['license'])
material.status = repo_data['archived'] ? 'Archived' : 'Active'
material.doi = fetch_doi(repo_data['full_name'])
material.version = fetch_latest_release(repo_data['full_name'])
material.date_created = repo_data['created_at']
material.date_published = repo_data['pushed_at']
material.date_modified = repo_data['updated_at']
material.contributors = fetch_contributors(repo_data['contributors_url'], repo_data['full_name'])
material.resource_type = github_io_homepage ? ['Github Page'] : ['Github Repository']
material.prerequisites = fetch_prerequisites(html)
material
end

def github_io_homepage?(homepage)
return false if homepage.nil? || homepage.empty?

url = URI(homepage)
url.host&.downcase&.end_with?('.github.io')
end

def get_html(url)
response = HTTParty.get(url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] })
Nokogiri::HTML(response.body)
end

# DEFINITION – Opens the GitHub homepage and fetches the text of the first three <p> tags
# longer than 50 characters, joining them and appending a 'Read more...' link to the description.
# Short leading <p> tags are often not descriptive, hence the length filter.
def fetch_definition(html, url)
desc = ''
round = 3
html.css('p').each do |p|
p_txt = p&.text&.strip&.gsub(/\s+/, ' ') || ''
next if p_txt.length < 50 || round.zero?

desc = "#{desc}\n#{p_txt}"
round -= 1
end
"#{desc}\n(...) [Read more...](#{url})"
end

# LICENCE – Maps the GitHub licence object to a recognised licence identifier
# the licence must match the format of config/dictionaries/licences.yml
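# e.g. { "key" => "apache-2.0", "spdx_id" => "Apache-2.0" } => "Apache-2.0"
# e.g. { "key" => "other", "spdx_id" => "NOASSERTION" } => "other-at"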
def fetch_licence(licence)
return 'notspecified' if licence.nil? || licence == 'null'
return 'other-at' if licence['key'] == 'other'

licence['spdx_id']
end

# DOI – Fetches the DOI from the repository's README.md.
# Only README.md is read, as it carries the DOI badge almost every time;
# also fetching CITATION.cff or CITATION.md would increase the number of API requests.
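# e.g. a badge such as [![DOI](badge.svg)](https://doi.org/10.5281/zenodo.1234567)
# yields "https://doi.org/10.5281/zenodo.1234567" (hypothetical DOI for illustration)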
def fetch_doi(full_name)
filename = 'README.md'
url = "#{GITHUB_API_BASE}/#{full_name}/contents/#{filename}"
data = cache_fetch("#{CACHE_PREFIX}doi_#{full_name.gsub('/', '_')}_#{filename.downcase}", url)
return nil unless data && data['content']

decoded = Base64.decode64(data['content'])
doi_match = decoded.match(%r{doi.org/\s*([^\s,)]+)}i)
doi_match ? "https://doi.org/#{doi_match[1]}" : nil
end

# RELEASE – Opens the releases API address and returns the latest release's tag name
def fetch_latest_release(full_name)
url = "#{GITHUB_API_BASE}/#{full_name}/releases"
releases = cache_fetch("#{CACHE_PREFIX}releases_#{full_name.gsub('/', '_')}", url)
releases.is_a?(Array) && releases.first ? releases.first['tag_name'] : nil
end

# CONTRIBUTORS – Opens the contributors API address and returns the list of contributor logins
def fetch_contributors(contributors_url, full_name)
contributors = cache_fetch("#{CACHE_PREFIX}contributors_#{full_name.gsub('/', '_')}", contributors_url)
return [] unless contributors

contributors.map { |c| c['login'] }
end

# PREREQUISITES – From the homepage HTML, collects <p>, <ul>, and <ol> tags that follow ...
def fetch_prerequisites(html)
prereq_paragraphs = []

# ... any heading tag (h1–h6) or span tag with text "prereq" (EN) or "prerreq" (ES)
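# e.g. <h2>Prerequisites</h2><p>Basic shell</p><ul><li>Python 3</li></ul>
# would collect the <p> and the <ul> (hypothetical page snippet)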
prereq_paragraphs = fetch_prerequisites_from_h(html, prereq_paragraphs)

# ... any tag with id containing "prereq" (EN) or "prerreq" (ES)
prereq_paragraphs = fetch_prerequisites_from_id_or_class(html, prereq_paragraphs) if prereq_paragraphs.empty?

prereq_paragraphs&.join("\n")&.gsub(/\n\n+/, "\n")&.strip || ''
end

def fetch_prerequisites_from_h(html, prereq_paragraphs)
html.xpath('//h1|//h2|//h3|//h4|//h5|//h6|//span').each do |h|
next unless h.text =~ /prereq|prerreq/i # if prereq in text

paragraph = h.xpath('following-sibling::*')
.take_while { |sib| %w[p ul ol].include?(sib.name) } # take either p, ul or ol
prereq_paragraphs.concat(paragraph) if paragraph
end
prereq_paragraphs
end

def fetch_prerequisites_from_id_or_class(html, prereq_paragraphs)
html.xpath('//*[@id]').each do |node|
next unless prereq_node?(node)

extract_following_paragraphs(node, prereq_paragraphs)
extract_nested_paragraphs(node, prereq_paragraphs) if prereq_paragraphs.empty?
end
prereq_paragraphs
end

def prereq_node?(node)
[node['id'], node['class']].compact.any? { |attr| attr =~ /prereq|prerreq/i }
end

def extract_following_paragraphs(node, prereq_paragraphs)
paragraphs = node.xpath('following-sibling::*')
.take_while { |sib| %w[p ul ol].include?(sib.name) }
prereq_paragraphs.concat(paragraphs) if paragraphs
end

def extract_nested_paragraphs(node, prereq_paragraphs)
paragraphs = node.xpath('.//p | .//ul | .//ol')
prereq_paragraphs.concat(paragraphs) if paragraphs.any?
end
end
end
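A minimal usage sketch (assumptions: the base Ingestor exposes a no-argument constructor and collects entries added via add_material; neither is shown in this diff):

ingestor = Ingestors::GithubIngestor.new
# one repo URL costs up to 4 API requests (metadata, README, releases, contributors)
ingestor.read('https://github.com/hsf-training/cpluspluscourse')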
28 changes: 28 additions & 0 deletions lib/ingestors/ingestor.rb
@@ -72,6 +72,34 @@ def open_url(url, raise: false, token: nil)
end
end

# Some URLs automatically redirect the user to another webpage.
# This method takes a URL and returns the final redirected URL (following 30X responses and `meta[http-equiv="Refresh"]` tags).
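# e.g. a page serving <meta http-equiv="Refresh" content="0; url=latest/">
# resolves to "#{url}/latest/" (hypothetical example page)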
def get_redirected_url(url, limit = 5) # rubocop:disable Metrics/AbcSize
raise 'Too many redirects' if limit.zero?

https_url = to_https(url) # some `homepage` values are plain http
response = HTTParty.get(https_url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] || 'TeSS Bot' })
return https_url unless response.headers['content-type']&.include?('html')

doc = Nokogiri::HTML(response.body)
meta = doc.at('meta[http-equiv="Refresh"]')
if meta && meta.to_s =~ /url=(.+)/i
content = meta['content']
relative_path = content[/url=(.+)/i, 1]
base = https_url.end_with?('/') ? https_url : "#{https_url}/"
escaped_path = URI::DEFAULT_PARSER.escape(relative_path).to_s
new_url = "#{base}#{escaped_path}"
return get_redirected_url(new_url, limit - 1)
end
https_url
end

def to_https(url)
uri = URI.parse(url)
uri.scheme = 'https'
uri.to_s
end

def convert_description(input)
return input if input.nil?

1 change: 1 addition & 0 deletions lib/ingestors/ingestor_factory.rb
@@ -11,6 +11,7 @@ def self.ingestors
Ingestors::MaterialCsvIngestor,
Ingestors::TessEventIngestor,
Ingestors::ZenodoIngestor,
Ingestors::GithubIngestor,
] + taxila_ingestors + llm_ingestors
end

25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/github/api-github-com.json
@@ -0,0 +1,25 @@
{
"name": "cpluspluscourse",
"full_name": "hsf-training/cpluspluscourse",
"owner": {
"login": "hsf-training"
},
"html_url": "https://github.com/hsf-training/cpluspluscourse",
"description": "C++ Course Taught at CERN",
"homepage": "",
"topics": [
"those",
"are",
"keywords"
],
"license": {
"key": "apache-2.0",
"name": "Apache License 2.0",
"spdx_id": "Apache-2.0"
},
"archived": true,
"created_at": "2025-09-29T14:38:38Z",
"updated_at": "2025-09-30T14:38:38Z",
"pushed_at": "2025-09-28T14:38:38Z",
"contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors"
}
25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/github/api-github-io-01.json
@@ -0,0 +1,25 @@
{
"name": "python-novice-inflammation",
"full_name": "swcarpentry/python-novice-inflammation",
"owner": {
"login": "swcarpentry"
},
"html_url": "https://github.com/swcarpentry/python-novice-inflammation",
"description": "This is not going to be read",
"homepage": "https://swcarpentry.github.io/python-novice-inflammation/",
"topics": [
"key",
"words",
"in topics"
],
"license": {
"key": "apache-2.0",
"name": "Apache License 2.0",
"spdx_id": "Apache-2.0"
},
"archived": false,
"created_at": "2025-09-29T14:38:38Z",
"updated_at": "2025-09-30T14:38:38Z",
"pushed_at": "2025-09-28T14:38:38Z",
"contributors_url": "https://api.github.com/repos/swcarpentry/python-novice-inflammation/contributors"
}
25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/github/api-github-io-02.json
@@ -0,0 +1,25 @@
{
"name": "hsf-training-scikit-hep-webpage",
"full_name": "hsf-training/hsf-training-scikit-hep-webpage",
"owner": {
"login": "hsf-training"
},
"html_url": "https://github.com/hsf-training/hsf-training-scikit-hep-webpage",
"description": null,
"homepage": "https://hsf-training.github.io/hsf-training-scikit-hep-webpage/",
"topics": [
"hacktoberfest",
"hey",
"test"
],
"license": {
"key": "other",
"name": "Other",
"spdx_id": "NOASSERTION"
},
"archived": false,
"created_at": "2022-03-23T17:00:05Z",
"updated_at": "2025-09-29T06:14:55Z",
"pushed_at": "2025-09-23T20:09:10Z",
"contributors_url": "https://api.github.com/repos/hsf-training/hsf-training-scikit-hep-webpage/contributors"
}
11 changes: 11 additions & 0 deletions test/fixtures/files/ingestion/github/api-modified.json
@@ -0,0 +1,11 @@
{
"name": "bigchange",
"full_name": "hsf-training/cpluspluscourse",
"html_url": "https://github.com/hsf-training/cpluspluscourse",
"topics": [
"those",
"are",
"NOT"
],
"contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors"
}
8 changes: 8 additions & 0 deletions test/fixtures/files/ingestion/github/contributors.json
@@ -0,0 +1,8 @@
[
{
"login": "jane"
},
{
"login": "doe"
}
]