2022-05-26 18:31:02 +02:00
require " ../helpers/serialized_yt_data "
2021-05-04 10:48:51 +02:00
# This file contains helper methods to parse the Youtube API json data into
# neat little packages we can use
# Tuple of Parsers/Extractors so we can easily cycle through them.
# Modules of the `Extractors` namespace (defined further down in this file). Each one
# exposes `process(initial_data)`, which returns an array of raw items when the response
# has the shape it understands and nil otherwise.
# NOTE(review): the code that iterates this tuple is outside this view; presumably the
# entries are tried in the order listed here - confirm before reordering.
private ITEM_CONTAINER_EXTRACTOR = {
2021-08-03 09:22:31 +02:00
Extractors :: YouTubeTabs ,
Extractors :: SearchResults ,
2022-11-05 18:56:35 +01:00
Extractors :: ContinuationContent ,
2021-05-04 10:48:51 +02:00
}
# Modules of the `Parsers` namespace (defined further down in this file). Each one exposes
# `process(item, author_fallback)`, which returns a parsed object when the item is of the
# renderer type it handles and nil otherwise.
# NOTE(review): the code that iterates this tuple is outside this view; presumably the
# entries are tried in the order listed here - confirm before reordering.
private ITEM_PARSERS = {
2023-07-15 15:41:04 +02:00
Parsers :: RichItemRendererParser ,
2021-08-03 09:22:31 +02:00
Parsers :: VideoRendererParser ,
Parsers :: ChannelRendererParser ,
Parsers :: GridPlaylistRendererParser ,
Parsers :: PlaylistRendererParser ,
Parsers :: CategoryRendererParser ,
2022-10-31 21:30:10 +01:00
Parsers :: ReelItemRendererParser ,
2023-03-22 02:24:37 +01:00
Parsers :: ItemSectionRendererParser ,
2022-11-05 18:56:35 +01:00
Parsers :: ContinuationItemRendererParser ,
2023-07-15 15:41:04 +02:00
Parsers :: HashtagRendererParser ,
2021-05-04 10:48:51 +02:00
}
2022-11-10 23:32:51 +01:00
# Type of the decoded InnerTube response handed to every `Extractors::*.process` method.
private alias InitialData = Hash ( String , JSON :: Any )
2021-09-28 17:19:55 +02:00
# Author name/id pair handed to every parser's `process` method; parsers read it when
# the renderer itself does not carry author information.
record AuthorFallback , name : String , id : String
2021-05-04 10:48:51 +02:00
2021-08-06 05:31:48 +02:00
# Namespace for logic relating to parsing InnerTube data into various datastructs.
#
# Each of the parsers in this namespace is accessed through the #process() method
# which validates the given data as applicable to itself. If it is applicable the given
# data is passed to the private `#parse()` method which returns a datastruct of the given
# type. Otherwise, nil is returned.
2021-08-03 09:22:31 +02:00
private module Parsers
2021-08-06 05:31:48 +02:00
# Parses a InnerTube videoRenderer into a SearchVideo. Returns nil when the given object isn't a videoRenderer
#
# A videoRenderer renders a video to click on within the YouTube and Invidious UI. It is **not**
# the watchable video itself.
#
# See specs for example.
#
# `videoRenderer`s can be found almost everywhere on YouTube. In categories, search results, channels, etc.
#
2021-08-03 09:22:31 +02:00
module VideoRendererParser
  # Returns a `SearchVideo` when *item* holds a videoRenderer or a gridVideoRenderer,
  # and nil for any other kind of item.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    if item_contents = (item[" videoRenderer "]? || item[" gridVideoRenderer "]?)
      return self.parse(item_contents, author_fallback)
    end
  end

  # Builds the `SearchVideo` from the renderer contents.
  #
  # *author_fallback* supplies the author name and id when the renderer carries neither
  # an ownerText nor a shortBylineText run. Raises when the renderer has no video id.
  private def self.parse(item_contents, author_fallback)
    video_id = item_contents[" videoId "].as_s
    title = extract_text(item_contents[" title "]?) || " "

    # Extract author information. Both locations have the same shape, so the first one
    # that is present wins.
    author_info = item_contents.dig?(" ownerText ", " runs ", 0) ||
                  item_contents.dig?(" shortBylineText ", " runs ", 0)

    if author_info
      author = author_info[" text "].as_s
      author_id = HelperExtractors.get_browse_id(author_info)
    else
      author = author_fallback.name
      author_id = author_fallback.id
    end

    author_verified = has_verified_badge?(item_contents[" ownerBadges "]?)

    # For live videos (and possibly recently premiered videos) there is no published information.
    # Instead, in its place is the amount of people currently watching. This behavior should be replicated
    # on Invidious once all features of livestreams are supported. On an unrelated note, defaulting to the current
    # time for publishing isn't a good idea.
    published = item_contents.dig?(" publishedTimeText ", " simpleText ").try { |t| decode_date(t.as_s) } || Time.local

    # Typically views are stored under a "simpleText" in the "viewCountText". However, for
    # livestreams and premiered it is stored under a "runs" array: [{"text":123}, {"text": "watching"}]
    # When view count is disabled the "viewCountText" is not present on InnerTube data.
    # TODO change default value to nil and typical encoding type to tuple storing type (watchers, views, etc)
    # and count
    view_count = item_contents.dig?(" viewCountText ", " simpleText ").try &.as_s.gsub(/ \ D+ /, " ").to_i64? || 0_i64

    description_html = item_contents[" descriptionSnippet "]?.try { |t| parse_content(t, video_id) } || " "

    length_seconds = self.length_seconds_of(item_contents)

    live_now = false
    premium = false

    premiere_timestamp = item_contents.dig?(" upcomingEventData ", " startTime ").try { |t| Time.unix(t.as_s.to_i64) }

    if badges = item_contents[" badges "]?
      badges.as_a.each do |badge|
        # A badge without a metadataBadgeRenderer or without a string label is skipped
        # rather than aborting the whole video.
        label = badge.dig?(" metadataBadgeRenderer ", " label ").try &.as_s?

        case label
        when " LIVE NOW "
          live_now = true
        when " New ", " 4K ", " CC "
          # TODO
        when " Premium "
          # TODO: Potentially available as item_contents["topStandaloneBadge"]["metadataBadgeRenderer"]
          premium = true
        else nil # Ignore
        end
      end
    end

    SearchVideo.new({
      title:              title,
      id:                 video_id,
      author:             author,
      ucid:               author_id,
      published:          published,
      views:              view_count,
      description_html:   description_html,
      length_seconds:     length_seconds,
      live_now:           live_now,
      premium:            premium,
      premiere_timestamp: premiere_timestamp,
      author_verified:    author_verified,
    })
  end

  # Returns the video length in seconds, or 0 when the renderer does not expose one.
  #
  # The length information generally exists in "lengthText". However, the info can sometimes
  # be retrieved from "thumbnailOverlays" (e.g when the video is a "shorts" one).
  private def self.length_seconds_of(item_contents)
    if length_container = item_contents[" lengthText "]?
      return decode_length_seconds(length_container[" simpleText "].as_s)
    end

    overlay = item_contents[" thumbnailOverlays "]?.try &.as_a.find(&.[" thumbnailOverlayTimeStatusRenderer "]?)

    # This needs to only go down the `simpleText` path (if possible). If more situations came up that requires
    # a specific pathway then we should add an argument to extract_text that'll make this possible
    raw_text = overlay.try &.dig?(" thumbnailOverlayTimeStatusRenderer ", " text ", " simpleText ")
    return 0 if raw_text.nil?

    text = raw_text.as_s
    if text == " SHORTS "
      # Approximate length to one minute, as "shorts" generally don't exceed that length.
      # TODO: Add some sort of metadata for the type of video (normal, live, premiere, shorts)
      60_i32
    else
      decode_length_seconds(text)
    end
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2021-08-06 05:31:48 +02:00
# Parses a InnerTube channelRenderer into a SearchChannel. Returns nil when the given object isn't a channelRenderer
#
# A channelRenderer renders a channel to click on within the YouTube and Invidious UI. It is **not**
# the channel page itself.
#
# See specs for example.
#
# `channelRenderer`s can be found almost everywhere on YouTube. In categories, search results, channels, etc.
#
2021-08-03 09:22:31 +02:00
module ChannelRendererParser
  # Returns a `SearchChannel` when *item* holds a channelRenderer or a gridChannelRenderer,
  # and nil for any other kind of item.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    if item_contents = (item[" channelRenderer "]? || item[" gridChannelRenderer "]?)
      return self.parse(item_contents, author_fallback)
    end
  end

  # Builds the `SearchChannel` from the renderer contents, using *author_fallback*
  # for the name and id when the renderer does not provide them.
  private def self.parse(item_contents, author_fallback)
    # `[]?` (not `[]`) so that a renderer without a title reaches the fallback name
    # instead of raising.
    author = extract_text(item_contents[" title "]?) || author_fallback.name
    author_id = item_contents[" channelId "]?.try &.as_s || author_fallback.id
    author_verified = has_verified_badge?(item_contents[" ownerBadges "]?)
    author_thumbnail = HelperExtractors.get_thumbnails(item_contents)

    # When public subscriber count is disabled, the subscriberCountText isn't sent by InnerTube.
    # Always simpleText
    # TODO change default value to nil
    subscriber_text = item_contents.dig?(" subscriberCountText ", " simpleText ").try &.as_s
    channel_handle = subscriber_text if (subscriber_text.try &.starts_with? " @ ")

    # Since youtube added channel handles, `VideoCountText` holds the number of
    # subscribers and `subscriberCountText` holds the handle, except when the
    # channel doesn't have a handle (e.g: some topic music channels).
    # See https://github.com/iv-org/invidious/issues/3394#issuecomment-1321261688
    if !subscriber_text || !subscriber_text.includes? " subscriber "
      subscriber_text = item_contents.dig?(" videoCountText ", " simpleText ").try &.as_s
    end

    subscriber_count = subscriber_text
      .try { |s| short_text_to_number(s.split(" ")[0]).to_i32 } || 0

    # Auto-generated channels don't have videoCountText
    # Taken from: https://github.com/iv-org/invidious/pull/2228#discussion_r717620922
    auto_generated = item_contents[" videoCountText "]?.nil?

    video_count = HelperExtractors.get_video_count(item_contents)
    description_html = item_contents[" descriptionSnippet "]?.try { |t| parse_content(t) } || " "

    SearchChannel.new({
      author:           author,
      ucid:             author_id,
      author_thumbnail: author_thumbnail,
      subscriber_count: subscriber_count,
      video_count:      video_count,
      channel_handle:   channel_handle,
      description_html: description_html,
      auto_generated:   auto_generated,
      author_verified:  author_verified,
    })
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2023-07-15 15:41:04 +02:00
# Parses an Innertube `hashtagTileRenderer` into a `SearchHashtag`.
# Returns `nil` when the given object is not a `hashtagTileRenderer`.
#
# A `hashtagTileRenderer` is a kind of search result.
# It can be found when searching for any hashtag (e.g "#hi" or "#shorts")
module HashtagRendererParser
  # Returns a `SearchHashtag` when *item* holds a hashtagTileRenderer, nil otherwise.
  # *author_fallback* is accepted for signature parity with the other parsers and is not used.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    item_contents = item[" hashtagTileRenderer "]?
    return if item_contents.nil?

    self.parse(item_contents)
  end

  # Builds the `SearchHashtag`. Any exception raised while reading the renderer is
  # logged at debug level and turned into a nil result.
  private def self.parse(item_contents)
    title = extract_text(item_contents[" hashtag "]).not_nil! # E.g "#hi"

    # E.g "/hashtag/hi"; rebuilt from the title when the tap command has no URL.
    tap_url = item_contents.dig?(" onTapCommand ", " commandMetadata ", " webCommandMetadata ", " url ")
    url = tap_url.try(&.as_s) || URI.encode_path(" /hashtag/ #{title.lchop('#')} ")

    videos_txt = extract_text(item_contents[" hashtagVideoCount "]?)     # E.g "203K videos"
    channels_txt = extract_text(item_contents[" hashtagChannelCount "]?) # E.g "81K channels"

    # Fallback for video/channel counts
    unless videos_txt && channels_txt
      # E.g: "203K videos • 81K channels"
      parts = extract_text(item_contents[" hashtagInfoText "]?).try &.split(" • ")

      if parts && parts.size == 2
        videos_txt ||= parts[0]
        channels_txt ||= parts[1]
      end
    end

    SearchHashtag.new({
      title:         title,
      url:           url,
      video_count:   short_text_to_number(videos_txt || " "),
      channel_count: short_text_to_number(channels_txt || " "),
    })
  rescue ex
    LOGGER.debug(" HashtagRendererParser: Failed to extract renderer. ")
    LOGGER.debug(" HashtagRendererParser: Got exception: #{ex.message} ")
    nil
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2021-08-06 05:31:48 +02:00
# Parses a InnerTube gridPlaylistRenderer into a SearchPlaylist. Returns nil when the given object isn't a gridPlaylistRenderer
#
# A gridPlaylistRenderer renders a playlist, that is located in a grid, to click on within the YouTube and Invidious UI.
# It is **not** the playlist itself.
#
# See specs for example.
#
# `gridPlaylistRenderer`s can be found on the playlist-tabs of channels and expanded categories.
#
2021-08-03 09:22:31 +02:00
module GridPlaylistRendererParser
  # Returns a `SearchPlaylist` when *item* holds a gridPlaylistRenderer, nil otherwise.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    if item_contents = item[" gridPlaylistRenderer "]?
      return self.parse(item_contents, author_fallback)
    end
  end

  # Builds the `SearchPlaylist`. The author name and id always come from
  # *author_fallback*, and the list of videos is always empty.
  private def self.parse(item_contents, author_fallback)
    # `[]?` (not `[]`) so that a renderer without a title reaches the default
    # instead of raising.
    title = extract_text(item_contents[" title "]?) || " "
    plid = item_contents[" playlistId "]?.try &.as_s || " "

    author_verified = has_verified_badge?(item_contents[" ownerBadges "]?)

    video_count = HelperExtractors.get_video_count(item_contents)
    playlist_thumbnail = HelperExtractors.get_thumbnails(item_contents)

    SearchPlaylist.new({
      title:           title,
      id:              plid,
      author:          author_fallback.name,
      ucid:            author_fallback.id,
      video_count:     video_count,
      videos:          [] of SearchPlaylistVideo,
      thumbnail:       playlist_thumbnail,
      author_verified: author_verified,
    })
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2021-08-06 05:31:48 +02:00
# Parses a InnerTube playlistRenderer into a SearchPlaylist. Returns nil when the given object isn't a playlistRenderer
#
# A playlistRenderer renders a playlist to click on within the YouTube and Invidious UI. It is **not** the playlist itself.
#
# See specs for example.
#
# `playlistRenderer`s can be found almost everywhere on YouTube. In categories, search results, recommended, etc.
#
2021-08-03 09:22:31 +02:00
module PlaylistRendererParser
  # Returns a `SearchPlaylist` when *item* holds a playlistRenderer, nil otherwise.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    if item_contents = item[" playlistRenderer "]?
      return self.parse(item_contents, author_fallback)
    end
  end

  # Builds the `SearchPlaylist`, including the preview videos listed in the renderer.
  # *author_fallback* is used when the renderer has no shortBylineText run.
  private def self.parse(item_contents, author_fallback)
    # `[]?` (not `[]`) so that a renderer without a title reaches the default
    # instead of raising.
    title = extract_text(item_contents[" title "]?) || " "
    plid = item_contents[" playlistId "]?.try &.as_s || " "

    video_count = HelperExtractors.get_video_count(item_contents)
    playlist_thumbnail = HelperExtractors.get_thumbnails_plural(item_contents)

    author_info = item_contents.dig?(" shortBylineText ", " runs ", 0)
    author = author_info.try &.[" text "].as_s || author_fallback.name
    author_id = author_info.try { |x| HelperExtractors.get_browse_id(x) } || author_fallback.id
    author_verified = has_verified_badge?(item_contents[" ownerBadges "]?)

    # No "videos" key yields an empty list.
    raw_videos = item_contents[" videos "]?.try(&.as_a) || [] of JSON::Any

    videos = raw_videos.map do |entry|
      child = entry[" childVideoRenderer "]

      SearchPlaylistVideo.new({
        title:          child.dig?(" title ", " simpleText ").try &.as_s || " ",
        id:             child[" videoId "]?.try &.as_s || " ",
        length_seconds: child.dig?(" lengthText ", " simpleText ").try { |t| decode_length_seconds(t.as_s) } || 0,
      })
    end

    # TODO: item_contents["publishedTimeText"]?

    SearchPlaylist.new({
      title:           title,
      id:              plid,
      author:          author,
      ucid:            author_id,
      video_count:     video_count,
      videos:          videos,
      thumbnail:       playlist_thumbnail,
      author_verified: author_verified,
    })
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2021-08-06 05:31:48 +02:00
# Parses a InnerTube shelfRenderer into a Category. Returns nil when the given object isn't a shelfRenderer
#
# A shelfRenderer renders divided sections on YouTube. IE "People also watched" in search results and
# the various organizational sections in the channel home page. A separate one (richShelfRenderer) is used
# for YouTube home. A shelfRenderer can also sometimes be expanded to show more content within it.
#
# See specs for example.
#
# `shelfRenderer`s can be found almost everywhere on YouTube. In categories, search results, channels, etc.
#
2021-08-03 09:22:31 +02:00
module CategoryRendererParser
  # Returns a `Category` when *item* holds a shelfRenderer, nil otherwise.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    item_contents = item[" shelfRenderer "]?
    return if item_contents.nil?

    self.parse(item_contents, author_fallback)
  end

  # Builds the `Category`, parsing every contained item through `parse_item`.
  # Returns nil for "special" categories whose list renderer is not supported.
  private def self.parse(item_contents, author_fallback)
    title = extract_text(item_contents[" title "]?) || " "
    url = item_contents.dig?(" endpoint ", " commandMetadata ", " webCommandMetadata ", " url ")
      .try &.as_s

    # Sometimes a category can have badges.
    badges = [] of Tuple(String, String) # (Badge style, label)
    if raw_badges = item_contents[" badges "]?
      raw_badges.as_a.each do |entry|
        renderer = entry[" metadataBadgeRenderer "]
        badges << {renderer[" style "].as_s, renderer[" label "].as_s}
      end
    end

    # Category description
    description_html = item_contents[" subtitle "]?.try { |desc| parse_content(desc) } || " "

    # InnerTube recognizes some "special" categories, which are organized differently.
    if special_category_container = item_contents[" content "]?
      content_container = special_category_container[" horizontalListRenderer "]? ||
                          special_category_container[" expandedShelfContentsRenderer "]? ||
                          special_category_container[" verticalListRenderer "]?

      # Anything else, such as `horizontalMovieListRenderer` is currently unsupported.
      return if content_container.nil?
    else
      # "Normal" category.
      content_container = item_contents[" contents "]
    end

    # Content parsing: only results that are a `SearchItem` are kept.
    contents = [] of SearchItem
    if entries = content_container[" items "]?
      entries.as_a.each do |entry|
        result = parse_item(entry, author_fallback.name, author_fallback.id)
        contents << result if result.is_a?(SearchItem)
      end
    end

    Category.new({
      title:            title,
      contents:         contents,
      description_html: description_html,
      url:              url,
      badges:           badges,
    })
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2022-05-26 18:31:02 +02:00
2023-03-22 02:24:37 +01:00
# Parses an InnerTube itemSectionRenderer into a SearchVideo.
# Returns nil when the given object isn't a ItemSectionRenderer
#
2023-05-14 22:49:49 +02:00
# An itemSectionRenderer seems to be a simple wrapper for a videoRenderer or a playlistRenderer, used
2023-03-22 02:24:37 +01:00
# by the result page for channel searches. It is located inside a continuationItems
# container. It is very similar to RichItemRendererParser.
#
module ItemSectionRendererParser
  # Unwraps the first entry of an itemSectionRenderer's contents and parses it.
  # Returns nil when *item* is not an itemSectionRenderer or has no first entry.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    item_contents = item.dig?(" itemSectionRenderer ", " contents ", 0)
    return if item_contents.nil?

    self.parse(item_contents, author_fallback)
  end

  # Tries the wrapped item as a video first, then as a playlist.
  private def self.parse(item_contents, author_fallback)
    VideoRendererParser.process(item_contents, author_fallback) ||
      PlaylistRendererParser.process(item_contents, author_fallback)
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2022-05-26 18:31:02 +02:00
# Parses an InnerTube richItemRenderer into a SearchVideo.
2022-10-31 21:30:10 +01:00
# Returns nil when the given object isn't a RichItemRenderer
2022-05-26 18:31:02 +02:00
#
# A richItemRenderer seems to be a simple wrapper for a videoRenderer, used
2023-07-12 20:06:34 +02:00
# by the result page for hashtags and for the podcast tab on channels.
# It is located inside a continuationItems container for hashtags.
2022-05-26 18:31:02 +02:00
#
module RichItemRendererParser
  # Unwraps the content of a richItemRenderer and parses it.
  # Returns nil when *item* is not a richItemRenderer.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    item_contents = item.dig?(" richItemRenderer ", " content ")
    return if item_contents.nil?

    self.parse(item_contents, author_fallback)
  end

  # Tries the wrapped item as a video, then as a reel, then as a playlist.
  private def self.parse(item_contents, author_fallback)
    VideoRendererParser.process(item_contents, author_fallback) ||
      ReelItemRendererParser.process(item_contents, author_fallback) ||
      PlaylistRendererParser.process(item_contents, author_fallback)
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2022-10-31 21:30:10 +01:00
# Parses an InnerTube reelItemRenderer into a SearchVideo.
# Returns nil when the given object isn't a reelItemRenderer
#
# reelItemRenderer items are used in the new (2022) channel layout,
# in the "shorts" tab.
#
module ReelItemRendererParser
  # Returns a `SearchVideo` when *item* holds a reelItemRenderer, nil otherwise.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    if item_contents = item[" reelItemRenderer "]?
      return self.parse(item_contents, author_fallback)
    end
  end

  # Builds the `SearchVideo` for a short.
  #
  # Details are read from the reel player header when it is present; otherwise the
  # author comes from *author_fallback*, the title from the item's headline and the
  # publication time defaults to the current time. Raises when the item has no video id.
  private def self.parse(item_contents, author_fallback)
    video_id = item_contents[" videoId "].as_s

    # Looked up with a single non-raising `dig?`: when any part of the overlay path
    # is absent we fall through to the `else` branch below instead of raising.
    video_details_container = item_contents.dig?(
      " navigationEndpoint ", " reelWatchEndpoint ",
      " overlay ", " reelPlayerOverlayRenderer ",
      " reelPlayerHeaderSupportedRenderers ",
      " reelPlayerHeaderRenderer "
    )

    if video_details_container
      # Author infos
      author = video_details_container
        .dig?(" channelTitleText ", " runs ", 0, " text ")
        .try &.as_s || author_fallback.name

      ucid = video_details_container
        .dig?(" channelNavigationEndpoint ", " browseEndpoint ", " browseId ")
        .try &.as_s || author_fallback.id

      # Title & publication date
      title = video_details_container.dig?(" reelTitleText ")
        .try { |t| extract_text(t) } || " "

      published = video_details_container
        .dig?(" timestampText ", " simpleText ")
        .try { |t| decode_date(t.as_s) } || Time.utc

      # View count
      view_count_text = video_details_container.dig?(" viewCountText ", " simpleText ")
    else
      author = author_fallback.name
      ucid = author_fallback.id
      published = Time.utc
      title = item_contents.dig?(" headline ", " simpleText ").try &.as_s || " "
      view_count_text = nil
    end

    # View count used to be in the reelWatchEndpoint, but that changed?
    view_count_text ||= item_contents.dig?(" viewCountText ", " simpleText ")
    view_count = short_text_to_number(view_count_text.try &.as_s || " 0 ")

    # Duration, parsed out of the accessibility label.
    a11y_data = item_contents
      .dig?(" accessibility ", " accessibilityData ", " label ")
      .try &.as_s || " "

    regex_match = / - (?<min> \ d+ minutes? )?(?<sec> \ d+ seconds?)+ - /.match(a11y_data)

    minutes = regex_match.try &.[" min "]?.try &.to_i(strict: false) || 0
    seconds = regex_match.try &.[" sec "]?.try &.to_i(strict: false) || 0

    duration = (minutes * 60 + seconds)

    SearchVideo.new({
      title:              title,
      id:                 video_id,
      author:             author,
      ucid:               ucid,
      published:          published,
      views:              view_count,
      description_html:   " ",
      length_seconds:     duration,
      live_now:           false,
      premium:            false,
      premiere_timestamp: Time.unix(0),
      author_verified:    false,
    })
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2022-11-05 18:56:35 +01:00
# Parses an InnerTube continuationItemRenderer into a Continuation.
# Returns nil when the given object isn't a continuationItemRenderer.
#
# continuationItemRenderer contains various metadata used to load more
# content (i.e. when the user scrolls down). The interesting bit is the
# protobuf object known as the "continuation token". Previously, those
# were generated from scratch, but recent (as of 11/2022) Youtube changes
# are forcing us to extract them from replies.
#
module ContinuationItemRendererParser
  # Returns a `Continuation` when *item* holds a continuationItemRenderer that carries
  # a token, nil otherwise. *author_fallback* is accepted for signature parity with the
  # other parsers and is not used.
  def self.process(item : JSON::Any, author_fallback : AuthorFallback)
    item_contents = item[" continuationItemRenderer "]?
    return if item_contents.nil?

    self.parse(item_contents)
  end

  # Wraps the continuation token in a `Continuation`; nil when there is no token.
  private def self.parse(item_contents)
    item_contents
      .dig?(" continuationEndpoint ", " continuationCommand ", " token ")
      .try { |raw_token| Continuation.new(raw_token.as_s) }
  end

  # Name of this parser module, resolved at compile time.
  def self.parser_name
    {{ @type.name }}
  end
end
2021-05-07 14:13:53 +02:00
end
2021-05-04 10:48:51 +02:00
# The following are the extractors for extracting an array of items from
# the internal Youtube API's JSON response. The result is then packaged into
# a structure we can more easily use via the parsers above. Their internals are
# identical to the item parsers.
2021-08-06 05:31:48 +02:00
# Namespace for logic relating to extracting InnerTube's initial response to items we can parse.
#
# Each of the extractors in this namespace is accessed through the #process() method
# which validates the given data as applicable to itself. If it is applicable the given
# data is passed to the private `#extract()` method which returns an array of
# parsable items. Otherwise, nil is returned.
#
# NOTE perhaps the result from here should be abstracted into a struct in order to
# get additional metadata regarding the container of the item(s).
2021-08-03 09:22:31 +02:00
private module Extractors
2021-08-06 05:31:48 +02:00
# Extracts items from the selected YouTube tab.
#
# YouTube tabs are typically stored under "twoColumnBrowseResultsRenderer"
# and is structured like this:
#
# "twoColumnBrowseResultsRenderer": {
# {"tabs": [
# {"tabRenderer": {
# "endpoint": {...}
# "title": "Playlists",
2022-07-03 14:03:30 +02:00
# "selected": true, # Is nil unless tab is selected
2021-08-06 05:31:48 +02:00
# "content": {...},
# ...
# }}
# ]}
# }
#
2021-08-03 09:22:31 +02:00
module YouTubeTabs
  # Returns the raw items of the selected tab when *initial_data* contains a
  # twoColumnBrowseResultsRenderer, nil otherwise.
  def self.process(initial_data : InitialData)
    if target = initial_data[" twoColumnBrowseResultsRenderer "]?
      self.extract(target)
    end
  end

  # Collects the items of the selected tab. Section lists are unpacked, rich grids are
  # returned as they are, and any other content yields an empty array.
  private def self.extract(target)
    content = extract_selected_tab(target[" tabs "])[" content "]

    if section_list_contents = content.dig?(" sectionListRenderer ", " contents ")
      unpack_section_list(section_list_contents)
    elsif rich_grid_contents = content.dig?(" richGridRenderer ", " contents ")
      rich_grid_contents.as_a
    else
      [] of JSON::Any
    end
  end

  # Flattens a section list: itemSectionRenderer entries are replaced by their unpacked
  # contents, every other entry is kept as is.
  private def self.unpack_section_list(contents)
    raw_items = [] of JSON::Any

    contents.as_a.each do |item|
      if item_section_content = item.dig?(" itemSectionRenderer ", " contents ")
        # `concat` appends in place; `+=` would copy the accumulator on every iteration.
        raw_items.concat(self.unpack_item_section(item_section_content))
      else
        raw_items << item
      end
    end

    raw_items
  end

  # Flattens an item section: entries that wrap a list of items (category extraction)
  # are replaced by that list, every other entry is kept as is.
  private def self.unpack_item_section(contents)
    raw_items = [] of JSON::Any

    contents.as_a.each do |item|
      # Category extraction
      if container = item.dig?(" gridRenderer ", " items ") || item.dig?(" items ")
        raw_items.concat(container.as_a)
      else
        raw_items << item
      end
    end

    raw_items
  end

  # Name of this extractor module, resolved at compile time.
  def self.extractor_name
    {{ @type.name }}
  end
end
2021-08-06 05:31:48 +02:00
# Extracts items from the InnerTube response for search results
#
# Search results are typically stored under "twoColumnSearchResultsRenderer"
# and is structured like this:
#
# "twoColumnSearchResultsRenderer": {
# {"primaryContents": {
# {"sectionListRenderer": {
# "contents": [...],
# ...,
# "subMenu": {...},
# "hideBottomSeparator": true,
# "targetId": "search-feed"
# }}
# }}
# }
#
2021-08-03 09:22:31 +02:00
module SearchResults
  # Returns the raw search result items when *initial_data* contains a
  # twoColumnSearchResultsRenderer, nil otherwise.
  def self.process(initial_data : InitialData)
    target = initial_data[" twoColumnSearchResultsRenderer "]?
    return if target.nil?

    self.extract(target)
  end

  # Concatenates the contents of every itemSectionRenderer found in the primary
  # section list; nodes of any other kind contribute nothing.
  private def self.extract(target)
    sections = target.dig(" primaryContents ", " sectionListRenderer ", " contents ").as_a

    sections.flat_map do |node|
      if item_section = node[" itemSectionRenderer "]?
        item_section[" contents "].as_a
      else
        [] of JSON::Any
      end
    end
  end

  # Name of this extractor module, resolved at compile time.
  def self.extractor_name
    {{ @type.name }}
  end
end
2021-08-06 05:31:48 +02:00
# Extracts continuation items from a InnerTube response
#
# Continuation items (on YouTube) are items which are appended to the
# end of the page for continuous scrolling. As such, in many cases,
# the items are lacking information such as author or category title,
# since the original results has already rendered them on the top of the page.
#
# The way they are structured is too varied to be accurately written down here.
# However, they all eventually lead to an array of parsable items after traversing
# through the JSON structure.
2022-11-10 23:32:51 +01:00
module ContinuationContent
  # Returns the continuation items when *initial_data* contains one of the three known
  # continuation containers (checked in the order below), nil otherwise.
  def self.process(initial_data : InitialData)
    target = initial_data[" continuationContents "]? ||
             initial_data[" appendContinuationItemsAction "]? ||
             initial_data[" reloadContinuationItemsCommand "]?

    self.extract(target) if target
  end

  # Picks the first item list that is present; an empty array when none is.
  private def self.extract(target)
    content = target[" continuationItems "]? ||
              target.dig?(" gridContinuation ", " items ") ||
              target.dig?(" richGridContinuation ", " contents ")

    content.try(&.as_a) || [] of JSON::Any
  end

  # Name of this extractor module, resolved at compile time.
  def self.extractor_name
    {{ @type.name }}
  end
end
end
2021-08-06 05:31:48 +02:00
# Helper methods to aid in the parsing of InnerTube to data structs.
#
# Mostly used to extract out repeated structures to deal with code
# repetition.
2022-02-03 00:44:52 +01:00
module HelperExtractors
  # Retrieves the amount of videos present within the given InnerTube data.
  #
  # The count is read from "videoCountText" (digits are picked out of the
  # extracted text) or, failing that, from the "videoCount" string.
  #
  # Returns 0 when it's unable to do so. This includes a "videoCountText"
  # that mentions subscribers, a text without any digit, and a "videoCount"
  # that isn't a numeric string.
  def self.get_video_count(container : JSON::Any) : Int32
    if box = container["videoCountText"]?
      # NOTE(review): extract_text is defined outside of this module; it is
      # assumed to return the text as a String, or nil when there is none.
      if (extracted_text = extract_text(box)) && !extracted_text.includes?(" subscriber")
        # `to_i?` rather than `to_i`: once the non-digits are stripped the
        # string may be empty, and `"".to_i` raises instead of yielding 0.
        return extracted_text.gsub(/\D/, "").to_i? || 0
      else
        return 0
      end
    elsif box = container["videoCount"]?
      return box.as_s?.try(&.to_i?) || 0
    else
      return 0
    end
  end

  # Retrieves the amount of views/viewers a video has.
  # Seems to be used on related videos only
  #
  # Returns "0" when unable to parse
  def self.get_short_view_count(container : JSON::Any) : String
    box = container["shortViewCountText"]?
    return "0" if !box

    # Simpletext: "4M views"
    # runs: {"text": "1.1K"},{"text":" watching"}
    return box["simpleText"]?.try(&.as_s.sub(" views", "")) ||
      box.dig?("runs", 0, "text").try(&.as_s) || "0"
  end

  # Retrieve lowest quality thumbnail from InnerTube data
  #
  # TODO allow configuration of image quality (-1 is highest)
  #
  # Raises when it's unable to parse from the given JSON data.
  def self.get_thumbnails(container : JSON::Any) : String
    return container.dig("thumbnail", "thumbnails", 0, "url").as_s
  end

  # Same as `get_thumbnails`, for the nested "plural" layout.
  #
  # YouTube sometimes sends the thumbnail as:
  # {"thumbnails": [{"thumbnails": [{"url": "example.com"}, ...]}]}
  #
  # Raises when it's unable to parse from the given JSON data.
  def self.get_thumbnails_plural(container : JSON::Any) : String
    return container.dig("thumbnails", 0, "thumbnails", 0, "url").as_s
  end

  # Retrieves the ID required for querying the InnerTube browse endpoint.
  # Returns an empty string when it's unable to do so
  def self.get_browse_id(container)
    return container.dig?("navigationEndpoint", "browseEndpoint", "browseId").try(&.as_s) || ""
  end
end
2021-06-29 18:23:48 +02:00
# Parses an item from Youtube's JSON response into a more usable structure.
# The end result can either be a SearchVideo, SearchPlaylist or SearchChannel.
2022-11-10 23:32:51 +01:00
def parse_item(item : JSON::Any, author_fallback : String? = "", author_id_fallback : String? = "")
  # nil is accepted for both fallbacks but is swapped for an empty string
  # here, so that callers (which more often than not pass nil) don't have
  # to be modified.
  fallback = AuthorFallback.new(author_fallback || "", author_id_fallback || "")

  # Try each item parser in turn. A parser checks by itself whether the data
  # applies to it and hands back nil when it doesn't, in which case the next
  # parser gets its chance. The first non-nil result is returned.
  ITEM_PARSERS.each do |parser|
    LOGGER.trace("parse_item: Attempting to parse item using \"#{parser.parser_name}\" (cycling...)")

    parsed = parser.process(item, fallback)
    if parsed
      LOGGER.debug("parse_item: Successfully parsed via #{parser.parser_name}")
      return parsed
    end

    LOGGER.trace("parse_item: Parser \"#{parser.parser_name}\" does not apply. Cycling to the next one...")
  end
end
2021-08-05 04:54:41 +02:00
# Parses multiple items from YouTube's initial JSON response into a more usable structure.
2021-06-29 18:23:48 +02:00
# The end result is an array of SearchItem.
2022-11-10 23:32:51 +01:00
#
# This function yields the container so that items can be parsed separately.
#
def extract_items(initial_data : InitialData, &block)
  # Unwrap the response: the first of "contents", "response",
  # onResponseReceivedActions[1] and onResponseReceivedActions[0] that is
  # present is used. When none is, the data is used as-is.
  unpackaged_data = initial_data["contents"]?.try(&.as_h) ||
                    initial_data["response"]?.try(&.as_h) ||
                    initial_data.dig?("onResponseReceivedActions", 1).try(&.as_h) ||
                    initial_data.dig?("onResponseReceivedActions", 0).try(&.as_h) ||
                    initial_data

  # Same cycling approach as parse_item(): every extractor is offered the
  # data and answers nil when it doesn't apply.
  ITEM_CONTAINER_EXTRACTOR.each do |extractor|
    LOGGER.trace("extract_items: Attempting to extract item container using \"#{extractor.extractor_name}\" (cycling...)")

    container = extractor.process(unpackaged_data)
    unless container
      LOGGER.trace("extract_items: Extractor \"#{extractor.extractor_name}\" does not apply. Cycling to the next one...")
      next
    end

    LOGGER.debug("extract_items: Successfully unpacked container with \"#{extractor.extractor_name}\"")

    # Hand each item of the container over to the caller's block
    container.each do |item|
      yield item
    end
  end
end
# Wrapper using the block function above
def extract_items(
  initial_data : InitialData,
  author_fallback : String? = nil,
  author_id_fallback : String? = nil
) : {Array(SearchItem), String?}
  results = [] of SearchItem
  token = nil

  # Parse every raw item yielded by the block-based extract_items above.
  # A Continuation only contributes its token; a SearchItem is collected;
  # anything else (including nil) is dropped.
  extract_items(initial_data) do |raw_item|
    parsed = parse_item(raw_item, author_fallback, author_id_fallback)

    if parsed.is_a?(Continuation)
      token = parsed.token
    elsif parsed.is_a?(SearchItem)
      results << parsed
    end
  end

  {results, token}
end