From 39019757485c242a502afdedd900ce801fe3ce44 Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Sun, 1 Oct 2023 19:39:53 +0200 Subject: [PATCH 1/4] Add a youtube URL sanitizer --- src/invidious/yt_backend/url_sanitizer.cr | 121 ++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 src/invidious/yt_backend/url_sanitizer.cr diff --git a/src/invidious/yt_backend/url_sanitizer.cr b/src/invidious/yt_backend/url_sanitizer.cr new file mode 100644 index 00000000..02bf77bf --- /dev/null +++ b/src/invidious/yt_backend/url_sanitizer.cr @@ -0,0 +1,121 @@ +require "uri" + +module UrlSanitizer + extend self + + ALLOWED_QUERY_PARAMS = { + channel: ["u", "user", "lb"], + playlist: ["list"], + search: ["q", "search_query", "sp"], + watch: [ + "v", # Video ID + "list", "index", # Playlist-related + "playlist", # Unnamed playlist (id,id,id,...) (embed-only?) + "t", "time_continue", "start", "end", # Timestamp + "lc", # Highlighted comment (watch page only) + ], + } + + # Returns wether the given string is an ASCII word. This is the same as + # running the following regex in US-ASCII locale: /^[\w-]+$/ + private def ascii_word?(str : String) : Bool + if str.bytesize == str.size + str.each_byte do |byte| + next if 'a'.ord <= byte <= 'z'.ord + next if 'A'.ord <= byte <= 'Z'.ord + next if '0'.ord <= byte <= '9'.ord + next if byte == '-'.ord || byte == '_'.ord + + return false + end + + return true + else + return false + end + end + + # Return which kind of parameters are allowed based on the + # first path component (breadcrumb 0). + private def determine_allowed(path_root : String) + case path_root + when "watch", "w", "v", "embed", "e", "shorts", "clip" + return :watch + when .starts_with?("@"), "c", "channel", "user", "profile", "attribution_link" + return :channel + when "playlist", "mix" + return :playlist + when "results", "search" + return :search + else # hashtag, post, trending, brand URLs, etc.. + return nil + end + end + + # Create a new URI::Param containing only the allowed parameters + private def copy_params(unsafe_params : URI::Params, allowed_type) : URI::Params + new_params = URI::Params.new + + ALLOWED_QUERY_PARAMS[allowed_type].each do |name| + if unsafe_params[name]? + # Only copy the last parameter, in case there is more than one + new_params[name] = unsafe_params.fetch_all(name)[-1] + end + end + + return new_params + end + + # Transform any user-supplied youtube URL into something we can trust + # and use across the code. + def process(str : String) : URI + # Because URI follows RFC3986 specifications, URL without a scheme + # will be parsed as a relative path. So we have to add a scheme ourselves. + str = "https://#{str}" if !str.starts_with?(/https?:\/\//) + + unsafe_uri = URI.parse(str) + new_uri = URI.new(path: "/") + + # Redirect to homepage for bogus URLs + return new_uri if (unsafe_uri.host.nil? || unsafe_uri.path.nil?) + + breadcrumbs = unsafe_uri.path + .split('/', remove_empty: true) + .compact_map do |bc| + # Exclude attempts at path trasversal + next if bc == "." || bc == ".." + + # Non-alnum characters are unlikely in a genuine URL + next if !ascii_word?(bc) + + bc + end + + # If nothing remains, it's either a legit URL to the homepage + # (who does that!?) or because we filtered some junk earlier. + return new_uri if breadcrumbs.empty? + + # Replace the original query parameters with the sanitized ones + case unsafe_uri.host.not_nil! + when .ends_with?("youtube.com") + # Use our sanitized path (not forgetting the leading '/') + new_uri.path = "/#{breadcrumbs.join('/')}" + + # Then determine which params are allowed, and copy them over + if allowed = determine_allowed(breadcrumbs[0]) + new_uri.query_params = copy_params(unsafe_uri.query_params, allowed) + end + when "youtu.be" + # Always redirect to the watch page + new_uri.path = "/watch" + + new_params = copy_params(unsafe_uri.query_params, :watch) + new_params["id"] = breadcrumbs[0] + + new_uri.query_params = new_params + end + + new_uri.host = nil # Safety measure + return new_uri + end +end From c28f72a52737728af665da91ad5e6761aba6eaeb Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Thu, 5 Oct 2023 23:01:44 +0200 Subject: [PATCH 2/4] Search: Add support for youtu.be and youtube.com URLs --- src/invidious/routes/search.cr | 6 ++++++ src/invidious/search/query.cr | 27 +++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/invidious/routes/search.cr b/src/invidious/routes/search.cr index 5be33533..85aa1c7e 100644 --- a/src/invidious/routes/search.cr +++ b/src/invidious/routes/search.cr @@ -51,6 +51,12 @@ module Invidious::Routes::Search else user = env.get? "user" + # An URL was copy/pasted in the search box. + # Redirect the user to the appropriate page. + if query.is_url? + return env.redirect UrlSanitizer.process(query.text).to_s + end + begin items = query.process rescue ex : ChannelSearchException diff --git a/src/invidious/search/query.cr b/src/invidious/search/query.cr index e38845d9..f87c243e 100644 --- a/src/invidious/search/query.cr +++ b/src/invidious/search/query.cr @@ -48,11 +48,12 @@ module Invidious::Search ) # Get the raw search query string (common to all search types). In # Regular search mode, also look for the `search_query` URL parameter - if @type.regular? - @raw_query = params["q"]? || params["search_query"]? || "" - else - @raw_query = params["q"]? || "" - end + _raw_query = params["q"]? + _raw_query ||= params["search_query"]? if @type.regular? + _raw_query ||= "" + + # Remove surrounding whitespaces. Mostly useful for copy/pasted URLs. + @raw_query = _raw_query.strip # Get the page number (also common to all search types) @page = params["page"]?.try &.to_i? || 1 @@ -85,7 +86,7 @@ module Invidious::Search @filters = Filters.from_iv_params(params) @channel = params["channel"]? || "" - if @filters.default? && @raw_query.includes?(':') + if @filters.default? && @raw_query.index(/\w:\w/) # Parse legacy filters from query @filters, @channel, @query, subs = Filters.from_legacy_filters(@raw_query) else @@ -136,5 +137,19 @@ module Invidious::Search return params end + + # Checks if the query is a standalone URL + def is_url? : Bool + # Only supported in regular search mode + return false if !@type.regular? + + # If filters are present, that's a regular search + return false if !@filters.default? + + # Simple heuristics: domain name + return @raw_query.starts_with?( + /(https?:\/\/)?(www\.)?(m\.)?youtu(\.be|be\.com)\// + ) + end end end From c05e7c81f03f1249782fe7d7999342129a72f102 Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Tue, 13 Feb 2024 21:46:12 +0100 Subject: [PATCH 3/4] Search: Add URL search inhibition logic --- src/invidious/search/query.cr | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/invidious/search/query.cr b/src/invidious/search/query.cr index f87c243e..b3db0f63 100644 --- a/src/invidious/search/query.cr +++ b/src/invidious/search/query.cr @@ -20,6 +20,9 @@ module Invidious::Search property region : String? property channel : String = "" + # Flag that indicates if the smart search features have been disabled. + @inhibit_ssf : Bool = false + # Return true if @raw_query is either `nil` or empty private def empty_raw_query? return @raw_query.empty? @@ -55,6 +58,13 @@ module Invidious::Search # Remove surrounding whitespaces. Mostly useful for copy/pasted URLs. @raw_query = _raw_query.strip + # Check for smart features (ex: URL search) inhibitor (exclamation mark). + # If inhibitor is present, remove it. + if @raw_query.starts_with?('!') + @inhibit_ssf = true + @raw_query = @raw_query[1..] + end + # Get the page number (also common to all search types) @page = params["page"]?.try &.to_i? || 1 @@ -140,6 +150,9 @@ module Invidious::Search # Checks if the query is a standalone URL def is_url? : Bool + # If the smart features have been inhibited, don't go further. + return false if @inhibit_ssf + # Only supported in regular search mode return false if !@type.regular? From b83f9a0a58b198266462d5a6777714f61f6825d2 Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Sat, 17 Feb 2024 14:27:25 +0100 Subject: [PATCH 4/4] Misc: Clean some code in UrlSanitizer --- src/invidious/yt_backend/url_sanitizer.cr | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/invidious/yt_backend/url_sanitizer.cr b/src/invidious/yt_backend/url_sanitizer.cr index 02bf77bf..725382ee 100644 --- a/src/invidious/yt_backend/url_sanitizer.cr +++ b/src/invidious/yt_backend/url_sanitizer.cr @@ -16,23 +16,21 @@ module UrlSanitizer ], } - # Returns wether the given string is an ASCII word. This is the same as + # Returns whether the given string is an ASCII word. This is the same as # running the following regex in US-ASCII locale: /^[\w-]+$/ private def ascii_word?(str : String) : Bool - if str.bytesize == str.size - str.each_byte do |byte| - next if 'a'.ord <= byte <= 'z'.ord - next if 'A'.ord <= byte <= 'Z'.ord - next if '0'.ord <= byte <= '9'.ord - next if byte == '-'.ord || byte == '_'.ord + return false if str.bytesize != str.size - return false - end + str.each_byte do |byte| + next if 'a'.ord <= byte <= 'z'.ord + next if 'A'.ord <= byte <= 'Z'.ord + next if '0'.ord <= byte <= '9'.ord + next if byte == '-'.ord || byte == '_'.ord - return true - else return false end + + return true end # Return which kind of parameters are allowed based on the @@ -74,12 +72,15 @@ module UrlSanitizer str = "https://#{str}" if !str.starts_with?(/https?:\/\//) unsafe_uri = URI.parse(str) + unsafe_host = unsafe_uri.host + unsafe_path = unsafe_uri.path + new_uri = URI.new(path: "/") # Redirect to homepage for bogus URLs - return new_uri if (unsafe_uri.host.nil? || unsafe_uri.path.nil?) + return new_uri if (unsafe_host.nil? || unsafe_path.nil?) - breadcrumbs = unsafe_uri.path + breadcrumbs = unsafe_path .split('/', remove_empty: true) .compact_map do |bc| # Exclude attempts at path trasversal @@ -96,7 +97,7 @@ module UrlSanitizer return new_uri if breadcrumbs.empty? # Replace the original query parameters with the sanitized ones - case unsafe_uri.host.not_nil! + case unsafe_host when .ends_with?("youtube.com") # Use our sanitized path (not forgetting the leading '/') new_uri.path = "/#{breadcrumbs.join('/')}" @@ -115,7 +116,6 @@ module UrlSanitizer new_uri.query_params = new_params end - new_uri.host = nil # Safety measure return new_uri end end