Parse provider API URLs + use new provider API in runAll scrape

This commit is contained in:
mrjvs 2023-12-18 21:47:19 +01:00
parent de30929dd6
commit a52fac701a
3 changed files with 249 additions and 92 deletions

View file

@ -1,7 +1,11 @@
import { ScrapeMedia } from "@movie-web/providers";
import {
FullScraperEvents,
RunOutput,
ScrapeMedia,
} from "@movie-web/providers";
import { RefObject, useCallback, useEffect, useRef, useState } from "react";
import { providers } from "@/utils/providers";
import { getLoadbalancedProviderApiUrl, providers } from "@/utils/providers";
export interface ScrapingItems {
id: string;
@ -18,96 +22,169 @@ export interface ScrapingSegment {
percentage: number;
}
export function useScrape() {
/**
 * Payload type of a single scraper event: the first argument of the
 * (non-nullable) handler registered under `Event` in `FullScraperEvents`.
 */
type ScraperEvent<Event extends keyof FullScraperEvents> = Parameters<
NonNullable<FullScraperEvents[Event]>
>[0];
/**
 * Shared progress state and event handlers for a scrape run.
 *
 * Holds the per-segment status map (`sources`), the display order of sources
 * with their discovered embeds (`sourceOrder`) and the id of the source that
 * was started last (`currentSource`). The returned handlers take the payloads
 * of the matching `FullScraperEvents` callbacks.
 *
 * All state updates are immutable: the previous state objects are never
 * modified in place, so consumers comparing segments by identity see changes.
 *
 * @returns the three state values plus `initEvent`, `startEvent`,
 * `updateEvent`, `discoverEmbedsEvent`, `startScrape` and `getResult`.
 */
function useBaseScrape() {
  const [sources, setSources] = useState<Record<string, ScrapingSegment>>({});
  const [sourceOrder, setSourceOrder] = useState<ScrapingItems[]>([]);
  const [currentSource, setCurrentSource] = useState<string>();
  // Id passed to the most recent `startEvent`; reset by `startScrape`.
  const lastId = useRef<string | null>(null);

  /** Replace all state with one "waiting" segment per announced source id. */
  const initEvent = useCallback((evt: ScraperEvent<"init">) => {
    const initial: Record<string, ScrapingSegment> = {};
    evt.sourceIds.forEach((sourceId) => {
      const source = providers.getMetadata(sourceId);
      if (!source) throw new Error("invalid source id");
      initial[source.id] = {
        name: source.name,
        id: source.id,
        status: "waiting",
        percentage: 0,
      };
    });
    setSources(initial);
    setSourceOrder(
      evt.sourceIds.map((sourceId) => ({ id: sourceId, children: [] }))
    );
  }, []);

  /** Mark the segment with the given id as "pending" and remember it. */
  const startEvent = useCallback((id: ScraperEvent<"start">) => {
    setSources((prev) => {
      const segment = prev[id];
      // Unknown ids still yield a fresh object, matching previous behavior.
      if (!segment) return { ...prev };
      return { ...prev, [id]: { ...segment, status: "pending" } };
    });
    setCurrentSource(id);
    lastId.current = id;
  }, []);

  /** Copy status, reason, error and percentage from the event onto its segment. */
  const updateEvent = useCallback((evt: ScraperEvent<"update">) => {
    setSources((prev) => {
      const segment = prev[evt.id];
      if (!segment) return { ...prev };
      return {
        ...prev,
        [evt.id]: {
          ...segment,
          status: evt.status,
          reason: evt.reason,
          error: evt.error,
          percentage: evt.percentage,
        },
      };
    });
  }, []);

  /** Register discovered embeds as "waiting" segments under their source. */
  const discoverEmbedsEvent = useCallback(
    (evt: ScraperEvent<"discoverEmbeds">) => {
      setSources((prev) => {
        // Build on a copy so a thrown error leaves the previous state intact.
        const next = { ...prev };
        evt.embeds.forEach((embed) => {
          const source = providers.getMetadata(embed.embedScraperId);
          if (!source) throw new Error("invalid source id");
          next[embed.id] = {
            embedId: embed.embedScraperId,
            name: source.name,
            id: embed.id,
            status: "waiting",
            percentage: 0,
          };
        });
        return next;
      });
      setSourceOrder((prev) => {
        const index = prev.findIndex((item) => item.id === evt.sourceId);
        if (index === -1) throw new Error("invalid source id");
        const next = [...prev];
        next[index] = {
          ...prev[index],
          children: evt.embeds.map((embed) => embed.id),
        };
        return next;
      });
    },
    []
  );

  /** Forget the last started id before a new run begins. */
  const startScrape = useCallback(() => {
    lastId.current = null;
  }, []);

  /**
   * Mark the last started segment as "success" when the run produced an
   * output, then hand the output back unchanged.
   */
  const getResult = useCallback((output: RunOutput | null) => {
    if (output && lastId.current) {
      setSources((prev) => {
        // Re-read the ref here: it may have been reset since the outer check.
        const finishedId = lastId.current;
        if (!finishedId) return prev;
        const segment = prev[finishedId];
        if (!segment) return { ...prev };
        return { ...prev, [finishedId]: { ...segment, status: "success" } };
      });
    }
    return output;
  }, []);

  return {
    initEvent,
    startEvent,
    updateEvent,
    discoverEmbedsEvent,
    startScrape,
    getResult,
    sources,
    sourceOrder,
    currentSource,
  };
}
export function useScrape() {
const {
sources,
sourceOrder,
currentSource,
updateEvent,
discoverEmbedsEvent,
initEvent,
getResult,
startEvent,
startScrape,
} = useBaseScrape();
const startScraping = useCallback(
async (media: ScrapeMedia) => {
if (!providers) return null;
const providerApiUrl = getLoadbalancedProviderApiUrl();
if (providerApiUrl) {
startScrape();
const sseOutput = await new Promise<RunOutput | null>(
(resolve, reject) => {
const scrapeEvents = new EventSource(providerApiUrl);
scrapeEvents.addEventListener("error", (err) => reject(err));
scrapeEvents.addEventListener("init", (e) => initEvent(e.data));
scrapeEvents.addEventListener("start", (e) => startEvent(e.data));
scrapeEvents.addEventListener("update", (e) => updateEvent(e.data));
scrapeEvents.addEventListener("discoverEmbeds", (e) =>
discoverEmbedsEvent(e.data)
);
scrapeEvents.addEventListener("finish", (e) => resolve(e.data));
}
);
return getResult(sseOutput);
}
let lastId: string | null = null;
if (!providers) return null;
startScrape();
const output = await providers.runAll({
media,
events: {
init(evt) {
setSources(
evt.sourceIds
.map((v) => {
const source = providers.getMetadata(v);
if (!source) throw new Error("invalid source id");
const out: ScrapingSegment = {
name: source.name,
id: source.id,
status: "waiting",
percentage: 0,
};
return out;
})
.reduce<Record<string, ScrapingSegment>>((a, v) => {
a[v.id] = v;
return a;
}, {})
);
setSourceOrder(evt.sourceIds.map((v) => ({ id: v, children: [] })));
},
start(id) {
setSources((s) => {
if (s[id]) s[id].status = "pending";
return { ...s };
});
setCurrentSource(id);
lastId = id;
},
update(evt) {
setSources((s) => {
if (s[evt.id]) {
s[evt.id].status = evt.status;
s[evt.id].reason = evt.reason;
s[evt.id].error = evt.error;
s[evt.id].percentage = evt.percentage;
}
return { ...s };
});
},
discoverEmbeds(evt) {
setSources((s) => {
evt.embeds.forEach((v) => {
const source = providers.getMetadata(v.embedScraperId);
if (!source) throw new Error("invalid source id");
const out: ScrapingSegment = {
embedId: v.embedScraperId,
name: source.name,
id: v.id,
status: "waiting",
percentage: 0,
};
s[v.id] = out;
});
return { ...s };
});
setSourceOrder((s) => {
const source = s.find((v) => v.id === evt.sourceId);
if (!source) throw new Error("invalid source id");
source.children = evt.embeds.map((v) => v.id);
return [...s];
});
},
init: initEvent,
start: startEvent,
update: updateEvent,
discoverEmbeds: discoverEmbedsEvent,
},
});
if (output && lastId) {
setSources((s) => {
if (!lastId) return s;
if (s[lastId]) s[lastId].status = "success";
return { ...s };
});
}
return output;
return getResult(output);
},
[setSourceOrder, setSources]
[
initEvent,
startEvent,
updateEvent,
discoverEmbedsEvent,
getResult,
startScrape,
]
);
return {

View file

@ -7,22 +7,25 @@ import {
targets,
} from "@movie-web/providers";
import { conf } from "@/setup/config";
import { useAuthStore } from "@/stores/auth";
import { getProviderApiUrls, getProxyUrls } from "@/utils/proxyUrls";
const originalUrls = conf().PROXY_URLS;
let fetchersIndex = -1;
export function getLoadbalancedProxyUrl() {
const fetchers = useAuthStore.getState().proxySet ?? originalUrls;
if (fetchersIndex === -1 || fetchersIndex >= fetchers.length) {
fetchersIndex = Math.floor(Math.random() * fetchers.length);
}
const proxyUrl = fetchers[fetchersIndex];
fetchersIndex = (fetchersIndex + 1) % fetchers.length;
return proxyUrl;
/**
 * Create a round-robin picker over a list that is re-read on every call.
 *
 * The first pick (and any pick after the list shrank below the current
 * position) starts at a random index; following picks walk the list in order
 * and wrap around.
 *
 * @param getter - returns the current list of candidates
 * @returns a function yielding the next entry; when the list is empty the
 * result is `undefined` at runtime, so callers should check for a falsy value
 */
function makeLoadbalancedList(getter: () => string[]) {
  let listIndex = -1;
  return () => {
    const fetchers = getter();
    const outOfRange =
      !Number.isInteger(listIndex) ||
      listIndex < 0 ||
      listIndex >= fetchers.length;
    if (outOfRange) {
      listIndex = Math.floor(Math.random() * fetchers.length);
    }
    const proxyUrl = fetchers[listIndex];
    // `x % 0` is NaN, which would never satisfy the reset checks again and
    // would keep the picker returning undefined even after the list is
    // populated. Fall back to the "uninitialised" marker for empty lists.
    listIndex = fetchers.length > 0 ? (listIndex + 1) % fetchers.length : -1;
    return proxyUrl;
  };
}
// Picker over the URLs returned by getProxyUrls, built with makeLoadbalancedList.
const getLoadbalancedProxyUrl = makeLoadbalancedList(getProxyUrls);
// Picker over the URLs returned by getProviderApiUrls, built the same way.
export const getLoadbalancedProviderApiUrl =
makeLoadbalancedList(getProviderApiUrls);
function makeLoadBalancedSimpleProxyFetcher() {
const fetcher: ProviderBuilderOptions["fetcher"] = (a, b) => {
const currentFetcher = makeSimpleProxyFetcher(

77
src/utils/proxyUrls.ts Normal file
View file

@ -0,0 +1,77 @@
import { conf } from "@/setup/config";
import { useAuthStore } from "@/stores/auth";
// URL list from the app configuration; getParsedUrls falls back to it when
// the auth store has no `proxySet`.
const originalUrls = conf().PROXY_URLS;
// Every entry type a configured URL may be tagged with.
const types = ["proxy", "api"] as const;
// Union of the literal values in `types`.
type ParsedUrlType = (typeof types)[number];
/** A configured URL together with the type it was tagged with. */
export interface ParsedUrl {
// The URL itself, without any "|params|" prefix.
url: string;
// Type of the entry: "proxy" or "api".
type: ParsedUrlType;
}
/**
 * Check whether the given string is accepted by the `URL` constructor.
 *
 * @param url - candidate string
 * @returns `true` when construction succeeds, `false` when it throws
 */
function canParseUrl(url: string): boolean {
  let parsed: URL | null = null;
  try {
    parsed = new URL(url);
  } catch {
    // Construction threw, so the string is not a usable URL.
    parsed = null;
  }
  return parsed !== null;
}
/**
 * Type guard: is the given string one of the known entry types in `types`?
 *
 * @param type - raw type string to check
 * @returns `true` (narrowing `type` to `ParsedUrlType`) when it is listed
 */
function isParsedUrlType(type: string): type is ParsedUrlType {
  // Widen the literal tuple to a string list so `includes` accepts any
  // string without resorting to an `any` cast.
  const knownTypes: readonly string[] = types;
  return knownTypes.includes(type);
}
/**
 * Parse a comma-separated list of `key=value` pairs into an object,
 * e.g. "a=b,c=d,d=e" becomes { a: "b", c: "d", d: "e" }.
 *
 * Each segment is split on "=" into at most two parts; segments that do not
 * yield exactly two non-empty parts are ignored.
 */
function parseParams(input: string): Record<string, string> {
  const pairs: string[][] = [];
  for (const segment of input.split(",")) {
    const pieces = segment.split("=", 2).filter((piece) => piece !== "");
    if (pieces.length === 2) pairs.push(pieces);
  }
  return Object.fromEntries(pairs);
}
/**
 * Read the configured URL list (the auth store's `proxySet`, falling back to
 * the configured defaults) and classify every entry.
 *
 * Two entry formats are accepted:
 * - a plain URL, which is treated as type "proxy";
 * - `|key=value,...|<url>`, where the `type` parameter selects the entry
 *   type and defaults to "proxy" when absent.
 *
 * Entries that are malformed, whose URL cannot be parsed, or whose type is
 * unknown are skipped.
 *
 * @returns the accepted entries in their original order
 */
export function getParsedUrls() {
  const urls = useAuthStore.getState().proxySet ?? originalUrls;
  const output: ParsedUrl[] = [];
  urls.forEach((url) => {
    if (!url.startsWith("|")) {
      // Without the parameter prefix an entry can only be a plain proxy URL.
      if (canParseUrl(url)) output.push({ url, type: "proxy" });
      return;
    }

    // The pipes must be escaped (a bare `|` is regex alternation) and the
    // pattern must not be global, otherwise capture groups are not returned.
    const match = /^\|([^|]+)\|(.*)$/.exec(url);
    if (!match) return;
    const [, rawParams, target] = match;
    if (!target || !canParseUrl(target)) return;

    const params = parseParams(rawParams);
    const type = params.type ?? "proxy";
    if (!isParsedUrlType(type)) return;

    output.push({ url: target, type });
  });
  return output;
}
/**
 * Collect the URLs of all parsed entries whose type is "proxy".
 *
 * @returns proxy URLs in the order produced by `getParsedUrls`
 */
export function getProxyUrls() {
  const proxyUrls: string[] = [];
  for (const entry of getParsedUrls()) {
    if (entry.type === "proxy") proxyUrls.push(entry.url);
  }
  return proxyUrls;
}
/**
 * Collect the URLs of all parsed entries whose type is "api".
 *
 * @returns provider API URLs in the order produced by `getParsedUrls`
 */
export function getProviderApiUrls() {
  const apiUrls: string[] = [];
  for (const entry of getParsedUrls()) {
    if (entry.type === "api") apiUrls.push(entry.url);
  }
  return apiUrls;
}