Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ee75675
Initial plan
Copilot May 22, 2026
7e6f161
Refactor Dublin Core ingestion from OAI-PMH ingestor
eilmiv Apr 7, 2026
174e18b
Add RSS ingestion for materials and events
eilmiv Apr 7, 2026
7870fc4
Add tests for RSS ingestors
eilmiv Apr 7, 2026
cd603d6
Add ingestors to factory
eilmiv Apr 8, 2026
d42a85b
Add support for common extensions
eilmiv Apr 8, 2026
70c9c20
Fix Zeitwerk inflection problem with RSS
eilmiv Apr 8, 2026
8aa69df
Add support for relative urls
eilmiv Apr 8, 2026
1a64ba7
Fixes from testing many RSS feeds
eilmiv Apr 8, 2026
a8a8025
Remove start and end date for events based on date published in rss
eilmiv Apr 8, 2026
1922f9b
Add feed url discovery from youtube url
eilmiv Apr 8, 2026
084c649
Fix error class that was too specific
eilmiv Apr 8, 2026
7674a4b
Fix link handling in atom feeds
eilmiv Apr 8, 2026
41faea6
Use relative import for loading the custom rss media extension
eilmiv Apr 9, 2026
2390ee7
Add comment for dublin core to text conversion options
eilmiv Apr 9, 2026
bb2b841
Improve error message when there is an unsupported feed type.
eilmiv May 13, 2026
b10eb87
Reuse code from youtube renderer for youtube link detection in RSS in…
eilmiv May 13, 2026
d20b27a
Small refactors in rss ingestion
eilmiv May 21, 2026
85a0070
More specific errors in rss ingestion
eilmiv May 21, 2026
af34912
Refactor Yahoo Media RSS namespace patch
eilmiv May 21, 2026
f0d02e6
Separate youtube ingestor
eilmiv May 22, 2026
8eb6968
Remove event rss ingestor
eilmiv May 22, 2026
34e9a6b
Test youtube ingestor
eilmiv May 22, 2026
8ec8978
Make YoutubeIngestor discover_feed_url private
Copilot May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/initializers/inflections.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@
# ActiveSupport::Inflector.inflections(:en) do |inflect|
# inflect.acronym "RESTful"
# end

# Register "RSS" as an acronym so inflections keep it fully upper-cased
# (e.g. in constant names such as MaterialRSSIngestor).
ActiveSupport::Inflector.inflections(:en) { |inflections| inflections.acronym('RSS') }
61 changes: 61 additions & 0 deletions config/initializers/rss_media_atom_patch.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
require 'rss'
require 'rss/atom'

# Extension for the Yahoo Media RSS namespace (xmlns:media="http://search.yahoo.com/mrss/").
# Used by feeds that carry rich media metadata, e.g. YouTube channel feeds which include
# <media:group>, <media:title>, and <media:description> elements.

module RSS
# Namespace constants and parser registrations for the Yahoo Media RSS extension.
module Media
# XML prefix and namespace URI of the extension.
MEDIA_PREFIX = 'media'
MEDIA_URI = 'http://search.yahoo.com/mrss/'

# Mixin that gives an element class an optional <media:group> child,
# exposed through a `media_group` accessor.
module MediaGroupDescriptionModel
extend BaseModel

# Hook run by Ruby when this module is included somewhere.
# The installers below are only applied when the receiver is not a plain Module.
def self.append_features(klass)
super
return if klass.instance_of?(Module)

klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI)
# NOTE(review): '?' is presumably the rss gem's "zero or one occurrence" marker — confirm.
klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group')
end
end

# Tell the parser which class name represents <media:group> and that
# <media:title> / <media:description> are read as plain text elements.
BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup')
BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title')
BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description')
end

module Atom
# Register the media prefix/URI pair on Atom feeds.
Feed.install_ns(Media::MEDIA_PREFIX, Media::MEDIA_URI)

# Reopen the Atom feed class to attach the media extension.
class Feed
include Media::MediaGroupDescriptionModel

class Entry
include Media::MediaGroupDescriptionModel

# Element class for <media:group>; carries optional media_title and media_description.
# NOTE(review): the shape (include RSS09, @tag_name, required_prefix/required_uri)
# appears to follow the rss gem's own extension elements — confirm against the gem source.
class MediaGroup < Element
include RSS09

@tag_name = 'group'

class << self
# Prefix this element must be written with.
def required_prefix
Media::MEDIA_PREFIX
end

# Namespace URI this element belongs to.
def required_uri
Media::MEDIA_URI
end
end

install_must_call_validator(Media::MEDIA_PREFIX, Media::MEDIA_URI)
install_text_element('title', Media::MEDIA_URI, '?', 'media_title')
install_text_element('description', Media::MEDIA_URI, '?', 'media_description')
end
end
end
end
end
81 changes: 81 additions & 0 deletions lib/ingestors/dublin_core_ingestion.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
module Ingestors
  # Helpers that turn a hash of Dublin Core fields into material/event attribute objects.
  #
  # The `dc` hash is read through the keys :title, :description, :creators, :contributors,
  # :rights, :dates, :identifiers, :subjects, :types and :publisher.
  # The including class must provide `convert_description`; it is not defined in this module.
  module DublinCoreIngestion
    # Builds an OpenStruct with material attributes from Dublin Core data.
    #
    # @param dc [Hash] Dublin Core fields (see module comment for the keys read)
    # @return [OpenStruct] material attributes
    def build_material_from_dublin_core_data(dc)
      material = OpenStruct.new

      material.title = dc[:title]
      material.description = convert_description(dc[:description])
      material.authors = normalize_dublin_core_values(dc[:creators])
      material.contributors = normalize_dublin_core_values(dc[:contributors])

      # Prefer a rights value that is a URL, then any rights value, then the fallback string.
      rights = normalize_dublin_core_values(dc[:rights])
      material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified'

      # The first parseable date is the creation date; the last one only counts as
      # a modification date when there is more than one.
      parsed_dates = parse_dublin_core_dates(dc[:dates])
      material.date_created = parsed_dates.first
      material.date_modified = parsed_dates.last if parsed_dates.size > 1

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      material.doi = extract_dublin_core_doi(identifiers)
      material.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      material.keywords = normalize_dublin_core_values(dc[:subjects])
      material.resource_type = normalize_dublin_core_values(dc[:types])
      material.contact = dublin_core_text(dc[:publisher])

      material
    end

    # Builds an OpenStruct with event attributes from Dublin Core data.
    #
    # @param dc [Hash] Dublin Core fields (see module comment for the keys read)
    # @return [OpenStruct] event attributes
    def build_event_from_dublin_core_data(dc)
      event = OpenStruct.new

      event.title = dc[:title]
      event.description = convert_description(dc[:description])
      event.organizer = normalize_dublin_core_values(dc[:creators]).first
      event.contact = dublin_core_text(dc[:publisher]) || event.organizer
      event.keywords = normalize_dublin_core_values(dc[:subjects])
      event.event_types = normalize_dublin_core_values(dc[:types])

      # With a single date, first and last are the same element, so start == end;
      # with no parseable date both are nil.
      dates = parse_dublin_core_dates(dc[:dates])
      event.start = dates.first
      event.end = dates.last

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      event.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      event
    end

    # Parses every date value that Date.parse accepts; unparseable values are dropped.
    #
    # @param dates [Object] a single value, a list of values, or nil
    # @return [Array<Date>] parsed dates in input order
    def parse_dublin_core_dates(dates)
      normalize_dublin_core_values(dates).map do |date_value|
        Date.parse(date_value)
      rescue Date::Error, ArgumentError
        nil
      end.compact
    end

    # Finds the first identifier that is a bare DOI ("10.…") or a doi.org URL and
    # returns it as an https://doi.org/ URL.
    #
    # @param identifiers [Object] a single value, a list of values, or nil
    # @return [String, nil] the DOI URL, or nil when no identifier looks like a DOI
    def extract_dublin_core_doi(identifiers)
      doi = normalize_dublin_core_values(identifiers).find do |id|
        id.start_with?('10.', 'https://doi.org/', 'http://doi.org/')
      end
      return nil unless doi

      # Anchor at the start so only a leading resolver prefix is removed; a DOI
      # suffix that happens to contain "https://doi.org/" must stay intact.
      normalized = doi.sub(%r{\Ahttps?://doi\.org/}, '')
      "https://doi.org/#{normalized}"
    end

    # Converts the given value(s) to stripped strings, dropping blanks and duplicates.
    #
    # @param values [Object] a single value, a list of values, or nil
    # @return [Array<String>] unique, non-blank strings in input order
    def normalize_dublin_core_values(values)
      Array(values).map { |v| dublin_core_text(v).to_s.strip }
                   .reject(&:blank?).uniq
    end

    # Extracts the text of a value that may be an XML node or a plain object.
    # this method is also used by RSS ingestion under an alias
    #
    # @param value [Object, nil]
    # @return [String, nil] nil only when value is nil
    def dublin_core_text(value)
      return nil if value.nil?
      return value.content if value.respond_to?(:content) # rss gem xml nodes
      return value.text if value.respond_to?(:text) && !value.is_a?(String) # Nokogiri xml nodes

      value.to_s
    end
  end
end
2 changes: 2 additions & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def self.ingestors
Ingestors::ZenodoIngestor,
Ingestors::OaiPmhIngestor,
Ingestors::GithubIngestor,
Ingestors::MaterialRSSIngestor,
Ingestors::YoutubeIngestor
] + taxila_ingestors + llm_ingestors + heptraining_ingestors
end

Expand Down
Loading