243 changes: 243 additions & 0 deletions lib/ingestors/github_ingestor.rb
@@ -0,0 +1,243 @@
# frozen_string_literal: true

require 'open-uri'
require 'json'
require 'httparty'
require 'nokogiri'

module Ingestors
# GithubIngestor fetches repository information from GitHub to populate the materials' metadata.
# API requests per ingested repository:
# 1. Get the repo's general metadata #{GITHUB_API_BASE}/#{full_name}
# and keys: name, full_name, owner.login, html_url, description,
# homepage, topics, license.{key, spdx_id}, archived,
# created_at, pushed_at, updated_at, contributors_url
# 2. Get the DOI from #{GITHUB_API_BASE}/#{full_name}/contents/README.md
# and key: content
# 3. Get the version/release from #{GITHUB_API_BASE}/#{full_name}/releases
# and key: tag_name (first entry)
# 4. Get the contributors' list from #{GITHUB_API_BASE}/#{full_name}/contributors
# and key: login (from all entries)
class GithubIngestor < Ingestor # rubocop:disable Metrics/ClassLength
include Ingestors::Concerns::SitemapHelpers

GITHUB_API_BASE = 'https://api.github.com/repos'
CACHE_PREFIX = 'github_ingestor_'
TTL = 1.week # cache time-to-live before entries expire

def self.config
{
key: 'github',
title: 'GitHub Repository or Page',
category: :materials,
user_agent: 'TeSS Github ingestor'
}
end

# Reads from direct GitHub URLs, .xml sitemaps, and .txt sitemaps.
# Fetches repository metadata, contributors, releases, and DOIs (from README.md).
# It automatically handles both GitHub Pages URLs (github.io) and standard github.com URLs.
# API responses are cached to avoid repeated calls.
def read(source_url)
@verbose = false
# parse_sitemap returns either a list of unique URL entries or the source URL itself
sources = parse_sitemap(source_url)

sources.each do |url|
# For each source: if it is a github.{com|io} URL, derive its API URL; otherwise skip it
repo_api_url = to_github_api(url)
next unless repo_api_url

# Gets the cached repo data, or fetches and caches it
key = "#{CACHE_PREFIX}#{repo_api_url.gsub(%r{https?://}, '').gsub('/', '_')}"
repo_data = cache_fetch(key, repo_api_url)
next unless repo_data

# Add to material
add_material to_material(repo_data)
end
rescue StandardError => e
Rails.logger.error("#{e.class}: read() failed, #{e.message}")
end

private

# Takes a github.{com|io} URL and returns the corresponding api.github.com URL, or nil if it matches neither
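# e.g. https://github.com/hsf-training/cpluspluscourse
#   => https://api.github.com/repos/hsf-training/cpluspluscourse
# e.g. https://swcarpentry.github.io/python-novice-inflammation/
#   => https://api.github.com/repos/swcarpentry/python-novice-inflammation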
def to_github_api(url)
uri = URI(url)
parts = uri.path.split('/') # 'example.com/foo/bar' will have path == '/foo/bar', so three parts

# http(s)://github.com/<username>/<repo> is the only accepted github.com form
if uri.host&.downcase == 'github.com' && (uri.host.count('.') == 1) && parts.size == 3
github_api_from_com(parts)
# http(s)://<username>.github.io/<repo> is the only accepted github.io form
elsif uri.host&.downcase&.end_with?('.github.io') && (uri.host.count('.') == 2) && parts.size >= 2
github_api_from_io(uri, parts)
end
end

def github_api_from_com(parts)
"#{GITHUB_API_BASE}/#{parts[1]}/#{parts[2]}"
end

def github_api_from_io(uri, parts)
repo = parts[1]
owner = uri.host.split('.').first
"#{GITHUB_API_BASE}/#{owner}/#{repo}"
end

# Fetches cached data for a key, or opens the URL, parses the JSON, and caches the result.
# Caching matters because GitHub limits unauthenticated clients to 60 API requests per hour:
# https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users
# One ingested GitHub URL translates to up to 4 GitHub API requests.
# key: string key for the cache
# url: URL to open
# Entries expire after TTL (1 week).
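# e.g. for https://api.github.com/repos/hsf-training/cpluspluscourse, read() builds the key
# "github_ingestor_api.github.com_repos_hsf-training_cpluspluscourse"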
def cache_fetch(key, url)
Rails.cache.fetch(key, expires_in: TTL, skip_nil: true) do
JSON.parse(open_url(url).read)
end
end

# Builds an OpenStruct material from the repo data and returns it
def to_material(repo_data) # rubocop:disable Metrics/AbcSize
github_io_homepage = github_io_homepage? repo_data['homepage']
url = github_io_homepage ? repo_data['homepage'] : repo_data['html_url']
redirected_url = get_redirected_url(url)
html = get_html(redirected_url)

material = OpenStruct.new
material.title = repo_data['name'].titleize
material.url = url
material.description = github_io_homepage ? fetch_definition(html, redirected_url) : repo_data['description']
material.keywords = repo_data['topics']
material.licence = fetch_licence(repo_data['license'])
material.status = repo_data['archived'] ? 'Archived' : 'Active'
material.doi = fetch_doi(repo_data['full_name'])
material.version = fetch_latest_release(repo_data['full_name'])
material.date_created = repo_data['created_at']
material.date_published = repo_data['pushed_at']
material.date_modified = repo_data['updated_at']
material.contributors = fetch_contributors(repo_data['contributors_url'], repo_data['full_name'])
material.resource_type = github_io_homepage ? ['Github Page'] : ['Github Repository']
material.prerequisites = fetch_prerequisites(html)
material
end

def github_io_homepage?(homepage)
return false if homepage.nil? || homepage.empty?

url = URI(homepage)
url.host&.downcase&.end_with?('.github.io')
end

def get_html(url)
response = HTTParty.get(url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] })
Nokogiri::HTML(response.body)
end

# DEFINITION – Opens the GitHub homepage and fetches the text of the first three <p> tags
# longer than 50 characters, joining them and appending a 'Read more...' link to the description.
# Short leading <p> tags are often not descriptive, hence the length filter.
def fetch_definition(html, url)
desc = ''
round = 3
html.css('p').each do |p|
p_txt = p&.text&.strip&.gsub(/\s+/, ' ') || ''
next if p_txt.length < 50 || round.zero?

desc = "#{desc}\n#{p_txt}"
round -= 1
end
"#{desc}\n(...) [Read more...](#{url})"
end

# LICENCE – Maps the GitHub licence object to a recognised licence identifier
# the licence must match the format of config/dictionaries/licences.yml
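# e.g. { "key" => "apache-2.0", "spdx_id" => "Apache-2.0" } => "Apache-2.0"
# e.g. { "key" => "other", "spdx_id" => "NOASSERTION" } => "other-at"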
def fetch_licence(licence)
return 'notspecified' if licence.nil? || licence == 'null'
return 'other-at' if licence['key'] == 'other'

licence['spdx_id']
end

# DOI – Fetches the DOI from the repository's README.md.
# Only README.md is read, as it carries the DOI badge almost every time;
# also fetching CITATION.cff or CITATION.md would increase the number of API requests.
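# e.g. a badge such as [![DOI](badge.svg)](https://doi.org/10.5281/zenodo.1234567)
# yields "https://doi.org/10.5281/zenodo.1234567" (hypothetical DOI for illustration)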
def fetch_doi(full_name)
filename = 'README.md'
url = "#{GITHUB_API_BASE}/#{full_name}/contents/#{filename}"
data = cache_fetch("#{CACHE_PREFIX}doi_#{full_name.gsub('/', '_')}_#{filename.downcase}", url)
return nil unless data && data['content']

decoded = Base64.decode64(data['content'])
doi_match = decoded.match(%r{doi.org/\s*([^\s,)]+)}i)
doi_match ? "https://doi.org/#{doi_match[1]}" : nil
end

# RELEASE – Opens the releases API address and returns the latest release's tag name
def fetch_latest_release(full_name)
url = "#{GITHUB_API_BASE}/#{full_name}/releases"
releases = cache_fetch("#{CACHE_PREFIX}releases_#{full_name.gsub('/', '_')}", url)
releases.is_a?(Array) && releases.first ? releases.first['tag_name'] : nil
end

# CONTRIBUTORS – Opens the contributors API address and returns the list of contributor logins
def fetch_contributors(contributors_url, full_name)
contributors = cache_fetch("#{CACHE_PREFIX}contributors_#{full_name.gsub('/', '_')}", contributors_url)
return [] unless contributors

contributors.map { |c| c['login'] }
end

# PREREQUISITES – From the homepage HTML, collects <p>, <ul>, and <ol> tags that follow ...
def fetch_prerequisites(html)
prereq_paragraphs = []

# ... any heading tag (h1–h6) or span tag with text "prereq" (EN) or "prerreq" (ES)
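# e.g. <h2>Prerequisites</h2><p>Basic shell</p><ul><li>Python 3</li></ul>
# would collect the <p> and the <ul> (hypothetical page snippet)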
prereq_paragraphs = fetch_prerequisites_from_h(html, prereq_paragraphs)

# ... any tag with id containing "prereq" (EN) or "prerreq" (ES)
prereq_paragraphs = fetch_prerequisites_from_id_or_class(html, prereq_paragraphs) if prereq_paragraphs.empty?

prereq_paragraphs&.join("\n")&.gsub(/\n\n+/, "\n")&.strip || ''
end

def fetch_prerequisites_from_h(html, prereq_paragraphs)
html.xpath('//h1|//h2|//h3|//h4|//h5|//h6|//span').each do |h|
next unless h.text =~ /prereq|prerreq/i # if prereq in text

paragraph = h.xpath('following-sibling::*')
.take_while { |sib| %w[p ul ol].include?(sib.name) } # take either p, ul or ol
prereq_paragraphs.concat(paragraph) if paragraph
end
prereq_paragraphs
end

def fetch_prerequisites_from_id_or_class(html, prereq_paragraphs)
html.xpath('//*[@id]').each do |node|
next unless prereq_node?(node)

extract_following_paragraphs(node, prereq_paragraphs)
extract_nested_paragraphs(node, prereq_paragraphs) if prereq_paragraphs.empty?
end
prereq_paragraphs
end

def prereq_node?(node)
[node['id'], node['class']].compact.any? { |attr| attr =~ /prereq|prerreq/i }
end

def extract_following_paragraphs(node, prereq_paragraphs)
paragraphs = node.xpath('following-sibling::*')
.take_while { |sib| %w[p ul ol].include?(sib.name) }
prereq_paragraphs.concat(paragraphs) if paragraphs
end

def extract_nested_paragraphs(node, prereq_paragraphs)
paragraphs = node.xpath('.//p | .//ul | .//ol')
prereq_paragraphs.concat(paragraphs) if paragraphs.any?
end
end
end
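A minimal usage sketch (assumptions: the base Ingestor exposes a no-argument constructor and collects entries added via add_material; neither is shown in this diff):

ingestor = Ingestors::GithubIngestor.new
# one repo URL costs up to 4 API requests (metadata, README, releases, contributors)
ingestor.read('https://github.com/hsf-training/cpluspluscourse')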
28 changes: 28 additions & 0 deletions lib/ingestors/ingestor.rb
@@ -72,6 +72,34 @@ def open_url(url, raise: false, token: nil)
end
end

# Some URLs automatically redirect the user to another webpage.
# This method takes a URL and returns the final redirected URL (following 30X responses and `meta[http-equiv="Refresh"]` tags).
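# e.g. a page serving <meta http-equiv="Refresh" content="0; url=latest/">
# resolves to "#{url}/latest/" (hypothetical example page)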
def get_redirected_url(url, limit = 5) # rubocop:disable Metrics/AbcSize
raise 'Too many redirects' if limit.zero?

https_url = to_https(url) # some `homepage` values are plain http
response = HTTParty.get(https_url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] || 'TeSS Bot' })
return https_url unless response.headers['content-type']&.include?('html')

doc = Nokogiri::HTML(response.body)
meta = doc.at('meta[http-equiv="Refresh"]')
if meta && meta.to_s =~ /url=(.+)/i
content = meta['content']
relative_path = content[/url=(.+)/i, 1]
base = https_url.end_with?('/') ? https_url : "#{https_url}/"
escaped_path = URI::DEFAULT_PARSER.escape(relative_path).to_s
new_url = "#{base}#{escaped_path}"
return get_redirected_url(new_url, limit - 1)
end
https_url
end

def to_https(url)
uri = URI.parse(url)
uri.scheme = 'https'
uri.to_s
end

def convert_description(input)
return input if input.nil?

1 change: 1 addition & 0 deletions lib/ingestors/ingestor_factory.rb
@@ -11,6 +11,7 @@ def self.ingestors
Ingestors::MaterialCsvIngestor,
Ingestors::TessEventIngestor,
Ingestors::ZenodoIngestor,
Ingestors::GithubIngestor,
] + taxila_ingestors + llm_ingestors
end

25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/github/api-github-com.json
@@ -0,0 +1,25 @@
{
"name": "cpluspluscourse",
"full_name": "hsf-training/cpluspluscourse",
"owner": {
"login": "hsf-training"
},
"html_url": "https://github.com/hsf-training/cpluspluscourse",
"description": "C++ Course Taught at CERN",
"homepage": "",
"topics": [
"those",
"are",
"keywords"
],
"license": {
"key": "apache-2.0",
"name": "Apache License 2.0",
"spdx_id": "Apache-2.0"
},
"archived": true,
"created_at": "2025-09-29T14:38:38Z",
"updated_at": "2025-09-30T14:38:38Z",
"pushed_at": "2025-09-28T14:38:38Z",
"contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors"
}
25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/github/api-github-io-01.json
@@ -0,0 +1,25 @@
{
"name": "python-novice-inflammation",
"full_name": "swcarpentry/python-novice-inflammation",
"owner": {
"login": "swcarpentry"
},
"html_url": "https://github.com/swcarpentry/python-novice-inflammation",
"description": "This is not going to be read",
"homepage": "https://swcarpentry.github.io/python-novice-inflammation/",
"topics": [
"key",
"words",
"in topics"
],
"license": {
"key": "apache-2.0",
"name": "Apache License 2.0",
"spdx_id": "Apache-2.0"
},
"archived": false,
"created_at": "2025-09-29T14:38:38Z",
"updated_at": "2025-09-30T14:38:38Z",
"pushed_at": "2025-09-28T14:38:38Z",
"contributors_url": "https://api.github.com/repos/swcarpentry/python-novice-inflammation/contributors"
}
25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/github/api-github-io-02.json
@@ -0,0 +1,25 @@
{
"name": "hsf-training-scikit-hep-webpage",
"full_name": "hsf-training/hsf-training-scikit-hep-webpage",
"owner": {
"login": "hsf-training"
},
"html_url": "https://github.com/hsf-training/hsf-training-scikit-hep-webpage",
"description": null,
"homepage": "https://hsf-training.github.io/hsf-training-scikit-hep-webpage/",
"topics": [
"hacktoberfest",
"hey",
"test"
],
"license": {
"key": "other",
"name": "Other",
"spdx_id": "NOASSERTION"
},
"archived": false,
"created_at": "2022-03-23T17:00:05Z",
"updated_at": "2025-09-29T06:14:55Z",
"pushed_at": "2025-09-23T20:09:10Z",
"contributors_url": "https://api.github.com/repos/hsf-training/hsf-training-scikit-hep-webpage/contributors"
}
11 changes: 11 additions & 0 deletions test/fixtures/files/ingestion/github/api-modified.json
@@ -0,0 +1,11 @@
{
"name": "bigchange",
"full_name": "hsf-training/cpluspluscourse",
"html_url": "https://github.com/hsf-training/cpluspluscourse",
"topics": [
"those",
"are",
"NOT"
],
"contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors"
}
8 changes: 8 additions & 0 deletions test/fixtures/files/ingestion/github/contributors.json
@@ -0,0 +1,8 @@
[
{
"login": "jane"
},
{
"login": "doe"
}
]