diff --git a/.env b/.env
index c347579..39be939 100644
--- a/.env
+++ b/.env
@@ -1,5 +1,11 @@
 PORT=8062
 BIND_ADDRESS=0.0.0.0
-FORCE_FILE_REBUILD=0
+FORCE_FILE_REBUILD=1
 DEBUG=0
-NODE_ENV=production
\ No newline at end of file
+NODE_ENV=production
+# Memory-impacting settings - trade threading efficiency for lower memory use. Much slower, but useful for limited-memory environments like a VPS
+# May also decrease 504 failure rates
+# Maximum number of jobs the crawler can queue. Setting it too high will cause a call stack overflow
+MAX_JOB_QUEUE=1000
+# Maximum number of pages that can be fetched for parsing at once. Has a major impact on memory usage; setting it to 12 results in about 1.1GiB
+MAX_FETCH_JOBS=1000
\ No newline at end of file
diff --git a/README.md b/README.md
index fbea5a7..58035ef 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,8 @@ It is finally here. There is now a way to search all of Myrient's offerings.
 # Requirements
 1. nodejs
 2. npm
-3. Requires at least 2.5GB worth of memory to complete the crawl
+3. Requires at least 1.1GB of memory to complete the crawl
+4. Requires roughly 2.1GB of memory to complete indexing. Once done, idle memory usage is about 1.1GB. Consider using swap in a low-memory environment.
 
 # Self-Hosting
 
diff --git a/lib/dircrawl.js b/lib/dircrawl.js
index 892354c..3ec30e1 100644
--- a/lib/dircrawl.js
+++ b/lib/dircrawl.js
@@ -1,133 +1,161 @@
-import { getTableRows, parseOutFile } from './fileworker.js'
-import {Piscina, FixedQueue} from 'piscina'
-import { resolve } from 'path'
-import debugPrint from './debugprint.js';
+import { getTableRows, parseOutFile } from "./fileworker.js";
+import { Piscina, FixedQueue } from "piscina";
+import { resolve } from "path";
+import debugPrint from "./debugprint.js";
 
 let piscina = new Piscina({
-    filename: resolve('./lib', "fileworker.js"),
-    taskQueue: new FixedQueue(),
-    });
+  filename: resolve("./lib", "fileworker.js"),
+  taskQueue: new FixedQueue(),
+});
 
-export default async function getAllFiles(catList){
-    var startTime = process.hrtime();
-    const url = 'https://myrient.erista.me/files/'
-    let parentRows = await getTableRows({url: url, base: ''})
-    let parents = []
-    for(let x = 0; x < parentRows.html.length; x++){
-        parents.push(await parseOutFile({file: parentRows.html[x], base: '', url: url, catList: catList}))
+export default async function getAllFiles(catList) {
+  var startTime = process.hrtime();
+  const url = "https://myrient.erista.me/files/";
+  let parentRows = await getTableRows({ url: url, base: "" });
+  let parents = [];
+  for (let x = 0; x < parentRows.html.length; x++) {
+    parents.push(
+      await parseOutFile({
+        file: parentRows.html[x],
+        base: "",
+        url: url,
+        catList: catList,
+      })
+    );
+  }
+  let dirWork = splitFilesAndFolders(parents);
+  let files = dirWork.files;
+  let dirs = dirWork.directories;
+  let fetchTasks = [];
+  let resolvedFetchTasks = [];
+  let parseTasks = [];
+  while (
+    dirs.length > 0 ||
+    fetchTasks.length > 0 ||
+    parseTasks.length > 0 ||
+    resolvedFetchTasks.length > 0
+  ) {
+    let dirStatus = "";
+    if (dirs.length > 0) {
+      debugPrint(`Queueing: ${dirs[0].name}`);
+      //add tasks
+      fetchTasks.push(
+        piscina
+          .run(
+            { url: dirs[0].path, base: dirs[0].name },
+            { name: "getTableRows" }
+          )
+          .catch((err) => {
+            console.error(err);
+          })
+      );
+      dirs.shift();
     }
-    let dirWork = splitFilesAndFolders(parents)
-    let files = dirWork.files
-    let dirs = dirWork.directories
-    let fetchTasks = []
-    let resolvedFetchTasks = []
-    let parseTasks = []
-    while(dirs.length > 0 || fetchTasks.length > 0 || parseTasks.length > 0 || resolvedFetchTasks.length > 0) {
-        let dirStatus = ''
-        if(dirs.length > 0) {
-            debugPrint(`Queueing: ${dirs[0].name}`)
-            //add tasks
-            fetchTasks.push(piscina.run({url: dirs[0].path, base: dirs[0].name}, {name: 'getTableRows'})
-                .catch(err => {console.error(err)})
-            )
-            dirs.shift()
+    //push completed fetch tasks to parse
+    if (
+      fetchTasks.length >= parseInt(process.env.MAX_FETCH_JOBS) ||
+      ((fetchTasks.length > 0 || resolvedFetchTasks.length > 0) &&
+        parseTasks.length == 0)
+    ) {
+      debugPrint(`Resolving ${fetchTasks.length} fetch tasks.`);
+      let settledTasks = await Promise.all(fetchTasks);
+      resolvedFetchTasks.push(...settledTasks);
+      while (resolvedFetchTasks.length > 0) {
+        if (piscina.queueSize >= parseInt(process.env.MAX_JOB_QUEUE)) {
+          //jump out if we have a ton of tasks scheduled.
+          break;
         }
-        //push completed fetch tasks to parse
-        if(dirs.length == 0 && (fetchTasks.length > 0 || resolvedFetchTasks.length > 0)){
-            debugPrint(`Resolving ${fetchTasks.length} fetch tasks.`)
-            let settledTasks = await Promise.all(fetchTasks)
-            resolvedFetchTasks.push(...settledTasks)
-            while(resolvedFetchTasks.length > 0){
-                if(piscina.queueSize >= 1000) { //jump out if we have a ton of tasks scheduled.
-                    break;
-                }
-                let completedTask = resolvedFetchTasks[0]
-                if(!completedTask) {
-                    console.log("Myrient crawl failed, try again later.")
-                    return
-                }
-                for(let y = 0; y < completedTask.html.length; y++){
-                    parseTasks.push(piscina.run({
-                        file: completedTask.html[y],
-                        base: completedTask.base,
-                        url: completedTask.url,
-                        catList: catList },
-                        { name: 'parseOutFile'}
-                    ))
-                }
-                resolvedFetchTasks.shift()
-            }
-
-            fetchTasks = [] //purge
-            dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`
+        let completedTask = resolvedFetchTasks[0];
+        if (!completedTask) {
+          console.log("Myrient crawl failed, try again later.");
+          return;
         }
-        //resolve parse tasks to go through fetch tasks
-        if(dirs.length == 0 && parseTasks.length > 0){
-            if(process.env.DEBUG == '1'){
-                console.log(`Resolving ${parseTasks.length} parse tasks.`)
-            }
-            let settledTasks = await Promise.all(parseTasks)
-            let working = splitFilesAndFolders(settledTasks)
-            if(working.files.length > 0) {files.push(...working.files)}
-            if(working.directories.length > 0) {dirs.push(...working.directories)}
-            parseTasks = [] //purge
-            dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`
-        }
-        if(dirStatus){
-            if(process.env.DEBUG == '1'){
-                console.log(dirStatus)
-            }
-            else{
-                singleLineStatus(dirStatus)
-            }
+        for (let y = 0; y < completedTask.html.length; y++) {
+          parseTasks.push(
+            piscina.run(
+              {
+                file: completedTask.html[y],
+                base: completedTask.base,
+                url: completedTask.url,
+                catList: catList,
+              },
+              { name: "parseOutFile" }
+            )
+          );
         }
+        resolvedFetchTasks.shift();
+      }
+
+      fetchTasks = []; //purge
+      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`;
     }
-    //add IDs after and strip full file name
-    let id = 0;
-    for(let file in files){
-        files[file].id = id++
-        delete files[file].name
+    //resolve parse tasks to go through fetch tasks
+    if (dirs.length == 0 && parseTasks.length > 0) {
+      if (process.env.DEBUG == "1") {
+        console.log(`Resolving ${parseTasks.length} parse tasks.`);
+      }
+      let settledTasks = await Promise.all(parseTasks);
+      let working = splitFilesAndFolders(settledTasks);
+      if (working.files.length > 0) {
+        files.push(...working.files);
+      }
+      if (working.directories.length > 0) {
+        dirs.push(...working.directories);
+      }
+      parseTasks = []; //purge
+      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`;
     }
-    var elapsed = parseHrtimeToSeconds(process.hrtime(startTime));
-    var m = Math.floor(elapsed / 60)
-    var s = Math.floor(elapsed % 60)
-    console.log(`\nFinished crawling Myrient in ${m}m${s}s.`)
-    await piscina.close()
-    return files
+    if (dirStatus) {
+      if (process.env.DEBUG == "1") {
+        console.log(dirStatus);
+      } else {
+        singleLineStatus(dirStatus);
+      }
+    }
+  }
+  //add IDs after and strip full file name
+  let id = 0;
+  for (let file in files) {
+    files[file].id = id++;
+    delete files[file].name;
+  }
+  var elapsed = parseHrtimeToSeconds(process.hrtime(startTime));
+  var m = Math.floor(elapsed / 60);
+  var s = Math.floor(elapsed % 60);
+  console.log(`\nFinished crawling Myrient in ${m}m${s}s.`);
+  await piscina.close();
+  return files;
 }
 
-function splitFilesAndFolders(dirArray){
-    let directories = []
-    let files = []
-    //first item is always the parent directory
-    for(let x = 1; x < dirArray.length; x++){
-        if(typeof dirArray[x] == 'undefined') continue
-        if(dirArray[x].size == '-'){
-            directories.push(dirArray[x])
-        }
-        else{
-            files.push(dirArray[x])
-        }
-    }
-    return{
-        directories: directories,
-        files: files
-    }
+function splitFilesAndFolders(dirArray) {
+  let directories = [];
+  let files = [];
+  //first item is always the parent directory
+  for (let x = 1; x < dirArray.length; x++) {
+    if (typeof dirArray[x] == "undefined") continue;
+    if (dirArray[x].size == "-") {
+      directories.push(dirArray[x]);
+    } else {
+      files.push(dirArray[x]);
+    }
+  }
+  return {
+    directories: directories,
+    files: files,
+  };
 }
 
-function singleLineStatus(str){
-    if(process.stdout.isTTY){
-        process.stdout.clearLine(0)
-        process.stdout.cursorTo(0)
-        process.stdout.write(str)
-    }
-    else{
-        console.log(str)
-    }
+function singleLineStatus(str) {
+  if (process.stdout.isTTY) {
+    process.stdout.clearLine(0);
+    process.stdout.cursorTo(0);
+    process.stdout.write(str);
+  } else {
+    console.log(str);
+  }
 }
 
-function parseHrtimeToSeconds(hrtime){
-    var seconds = (hrtime[0] + (hrtime[1] / 1e9)).toFixed(3);
-    return seconds;
+function parseHrtimeToSeconds(hrtime) {
+  var seconds = (hrtime[0] + hrtime[1] / 1e9).toFixed(3);
+  return seconds;
 }
-
diff --git a/lib/fileworker.js b/lib/fileworker.js
index 81b3731..7a0b11e 100644
--- a/lib/fileworker.js
+++ b/lib/fileworker.js
@@ -25,7 +25,7 @@ export async function getTableRows(data) {
         })
     }
     catch(error){
-        console.error(`\nFetch failed for ${data.url}, retries remaining: ${retryLeft}`, error)
+        console.error(`\nFetch failed for ${data.url}, retries remaining: ${retryLeft}\n`, error)
         await sleep(delayMs)
     }
     finally{
diff --git a/server.js b/server.js
index 33abb03..1247d19 100644
--- a/server.js
+++ b/server.js
@@ -195,4 +195,4 @@ server.on("listening", function () {
 });
 
 console.log(`Loaded ${fileCount} known files.`);
-cron.schedule("0 0 0 * * *", getFilesJob);
+cron.schedule("0 30 2 * * *", getFilesJob);
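Note on the new .env settings (commentary, not part of the patch): dircrawl.js reads them with a bare parseInt(process.env.MAX_FETCH_JOBS) / parseInt(process.env.MAX_JOB_QUEUE). If either variable is unset, parseInt returns NaN, and any >= comparison against NaN is always false, so both limits silently stop applying (batching then falls back to the other loop conditions). A minimal guard, assuming the 1000 defaults from .env; the intFromEnv helper is hypothetical and does not exist in the repo:

// Hypothetical helper: fall back to a default when the variable is
// missing or non-numeric, instead of letting NaN disable the limit.
function intFromEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name] ?? "", 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

const MAX_FETCH_JOBS = intFromEnv("MAX_FETCH_JOBS", 1000); // pages fetched per batch
const MAX_JOB_QUEUE = intFromEnv("MAX_JOB_QUEUE", 1000); // cap on queued Piscina jobs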
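On the server.js change: node-cron accepts an optional six-field expression with seconds in the leading position, so the nightly re-crawl moves from midnight to 02:30:00:

// node-cron fields: second minute hour day-of-month month day-of-week
// "0 0 0 * * *"  -> daily at 00:00:00 (old schedule)
// "0 30 2 * * *" -> daily at 02:30:00 (new schedule)
cron.schedule("0 30 2 * * *", getFilesJob);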