Added settings to allow for memory efficiency

Alexandra
2024-10-23 02:11:40 -06:00
parent 840ff31431
commit f58eca20c7
5 changed files with 156 additions and 121 deletions

.env

@@ -1,5 +1,11 @@
 PORT=8062
 BIND_ADDRESS=0.0.0.0
-FORCE_FILE_REBUILD=0
+FORCE_FILE_REBUILD=1
 DEBUG=0
-NODE_ENV=production
+NODE_ENV=production
+# Memory-impacting settings - trade threading efficiency for lower memory use. Much slower, but should be useful for limited-memory environments like a VPS.
+# May also decrease 504 failure rates.
+# Maximum number of jobs the crawler can queue. Setting it too high will cause a call stack overflow.
+MAX_JOB_QUEUE=1000
+# Maximum number of pages that can be fetched for parsing. Has a massive impact on memory usage. Setting it to 12 results in about 1.1GiB of memory usage.
+MAX_FETCH_JOBS=1000
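
These two knobs gate the crawl loop shown in the lib diff below: MAX_FETCH_JOBS caps how many directory fetches are queued before the batch is resolved, and MAX_JOB_QUEUE caps Piscina's backlog before more parse jobs are scheduled. A minimal sketch of that gating, assuming a fallback of 1000 when a variable is unset (the patched code calls parseInt without a default):

// Sketch only - mirrors the gating in getAllFiles(), not the project's exact code.
const MAX_FETCH_JOBS = parseInt(process.env.MAX_FETCH_JOBS ?? "1000", 10);
const MAX_JOB_QUEUE = parseInt(process.env.MAX_JOB_QUEUE ?? "1000", 10);

// Resolve the in-flight fetch batch once it hits the cap, or once parsing idles.
function shouldDrainFetches(inFlightFetches, pendingParses) {
  return (
    inFlightFetches >= MAX_FETCH_JOBS ||
    (inFlightFetches > 0 && pendingParses === 0)
  );
}

// Stop scheduling parse work while the worker pool's queue is saturated.
function shouldPauseScheduling(workerQueueSize) {
  return workerQueueSize >= MAX_JOB_QUEUE;
}

console.log(shouldDrainFetches(1000, 0)); // true: batch is full, drain it
console.log(shouldPauseScheduling(999)); // false: room left in the queue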


@@ -4,7 +4,8 @@ It is finally here. There is now a way to search all of Myrient's offerings.
 # Requirements
 1. nodejs
 2. npm
-3. Requires at least 2.5GB worth of memory to complete the crawl
+3. Requires at least 1.1GB worth of memory to complete the crawl
+4. Requires roughly 2.1GB worth of memory to complete indexing. Once done, idle memory usage is about 1.1GB. Consider using swap in a low-memory environment.
 # Self-Hosting
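
Since these figures are hard floors, it can help to check headroom before starting a crawl. A hypothetical preflight using Node's built-in os module (the 1.1GB and 2.1GB numbers are the README's own, not measured here):

// Hypothetical preflight check - not part of this repo.
import os from "node:os";
const gib = (bytes) => (bytes / 1024 ** 3).toFixed(2);
console.log(`Memory total: ${gib(os.totalmem())} GiB, free: ${gib(os.freemem())} GiB`);
// The crawl needs ~1.1 GiB and indexing peaks near 2.1 GiB, per the requirements above.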


@@ -1,133 +1,161 @@
-import { getTableRows, parseOutFile } from './fileworker.js'
-import {Piscina, FixedQueue} from 'piscina'
-import { resolve } from 'path'
-import debugPrint from './debugprint.js';
+import { getTableRows, parseOutFile } from "./fileworker.js";
+import { Piscina, FixedQueue } from "piscina";
+import { resolve } from "path";
+import debugPrint from "./debugprint.js";
 let piscina = new Piscina({
-    filename: resolve('./lib', "fileworker.js"),
-    taskQueue: new FixedQueue(),
-});
+  filename: resolve("./lib", "fileworker.js"),
+  taskQueue: new FixedQueue(),
+});
-export default async function getAllFiles(catList){
-    var startTime = process.hrtime();
-    const url = 'https://myrient.erista.me/files/'
-    let parentRows = await getTableRows({url: url, base: ''})
-    let parents = []
-    for(let x = 0; x < parentRows.html.length; x++){
-        parents.push(await parseOutFile({file: parentRows.html[x], base: '', url: url, catList: catList}))
-    }
+export default async function getAllFiles(catList) {
+  var startTime = process.hrtime();
+  const url = "https://myrient.erista.me/files/";
+  let parentRows = await getTableRows({ url: url, base: "" });
+  let parents = [];
+  for (let x = 0; x < parentRows.html.length; x++) {
+    parents.push(
+      await parseOutFile({
+        file: parentRows.html[x],
+        base: "",
+        url: url,
+        catList: catList,
+      })
+    );
+  }
-    let dirWork = splitFilesAndFolders(parents)
-    let files = dirWork.files
-    let dirs = dirWork.directories
-    let fetchTasks = []
-    let resolvedFetchTasks = []
-    let parseTasks = []
-    while(dirs.length > 0 || fetchTasks.length > 0 || parseTasks.length > 0 || resolvedFetchTasks.length > 0) {
-        let dirStatus = ''
-        if(dirs.length > 0) {
-            debugPrint(`Queueing: ${dirs[0].name}`)
-            //add tasks
-            fetchTasks.push(piscina.run({url: dirs[0].path, base: dirs[0].name}, {name: 'getTableRows'})
-                .catch(err => {console.error(err)})
-            )
-            dirs.shift()
-        }
+  let dirWork = splitFilesAndFolders(parents);
+  let files = dirWork.files;
+  let dirs = dirWork.directories;
+  let fetchTasks = [];
+  let resolvedFetchTasks = [];
+  let parseTasks = [];
+  while (
+    dirs.length > 0 ||
+    fetchTasks.length > 0 ||
+    parseTasks.length > 0 ||
+    resolvedFetchTasks.length > 0
+  ) {
+    let dirStatus = "";
+    if (dirs.length > 0) {
+      debugPrint(`Queueing: ${dirs[0].name}`);
+      //add tasks
+      fetchTasks.push(
+        piscina
+          .run(
+            { url: dirs[0].path, base: dirs[0].name },
+            { name: "getTableRows" }
+          )
+          .catch((err) => {
+            console.error(err);
+          })
+      );
+      dirs.shift();
+    }
-        //push completed fetch tasks to parse
-        if(dirs.length == 0 && (fetchTasks.length > 0 || resolvedFetchTasks.length > 0)){
-            debugPrint(`Resolving ${fetchTasks.length} fetch tasks.`)
-            let settledTasks = await Promise.all(fetchTasks)
-            resolvedFetchTasks.push(...settledTasks)
-            while(resolvedFetchTasks.length > 0){
-                if(piscina.queueSize >= 1000) { //jump out if we have a ton of tasks scheduled.
-                    break;
-                }
-                let completedTask = resolvedFetchTasks[0]
-                if(!completedTask) {
-                    console.log("Myrient crawl failed, try again later.")
-                    return
-                }
-                for(let y = 0; y < completedTask.html.length; y++){
-                    parseTasks.push(piscina.run({
-                        file: completedTask.html[y],
-                        base: completedTask.base,
-                        url: completedTask.url,
-                        catList: catList },
-                        { name: 'parseOutFile'}
-                    ))
-                }
-                resolvedFetchTasks.shift()
-            }
-            fetchTasks = [] //purge
-            dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`
-        }
+    //push completed fetch tasks to parse
+    if (
+      fetchTasks.length >= parseInt(process.env.MAX_FETCH_JOBS) ||
+      ((fetchTasks.length > 0 || resolvedFetchTasks.length > 0) &&
+        parseTasks.length == 0)
+    ) {
+      debugPrint(`Resolving ${fetchTasks.length} fetch tasks.`);
+      let settledTasks = await Promise.all(fetchTasks);
+      resolvedFetchTasks.push(...settledTasks);
+      while (resolvedFetchTasks.length > 0) {
+        if (piscina.queueSize >= parseInt(process.env.MAX_JOB_QUEUE)) {
+          //jump out if we have a ton of tasks scheduled.
+          break;
+        }
+        let completedTask = resolvedFetchTasks[0];
+        if (!completedTask) {
+          console.log("Myrient crawl failed, try again later.");
+          return;
+        }
+        for (let y = 0; y < completedTask.html.length; y++) {
+          parseTasks.push(
+            piscina.run(
+              {
+                file: completedTask.html[y],
+                base: completedTask.base,
+                url: completedTask.url,
+                catList: catList,
+              },
+              { name: "parseOutFile" }
+            )
+          );
+        }
+        resolvedFetchTasks.shift();
+      }
+      fetchTasks = []; //purge
+      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`;
+    }
-        //resolve parse tasks to go through fetch tasks
-        if(dirs.length == 0 && parseTasks.length > 0){
-            if(process.env.DEBUG == '1'){
-                console.log(`Resolving ${parseTasks.length} parse tasks.`)
-            }
-            let settledTasks = await Promise.all(parseTasks)
-            let working = splitFilesAndFolders(settledTasks)
-            if(working.files.length > 0) {files.push(...working.files)}
-            if(working.directories.length > 0) {dirs.push(...working.directories)}
-            parseTasks = [] //purge
-            dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`
-        }
+    //resolve parse tasks to go through fetch tasks
+    if (dirs.length == 0 && parseTasks.length > 0) {
+      if (process.env.DEBUG == "1") {
+        console.log(`Resolving ${parseTasks.length} parse tasks.`);
+      }
+      let settledTasks = await Promise.all(parseTasks);
+      let working = splitFilesAndFolders(settledTasks);
+      if (working.files.length > 0) {
+        files.push(...working.files);
+      }
+      if (working.directories.length > 0) {
+        dirs.push(...working.directories);
+      }
+      parseTasks = []; //purge
+      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`;
+    }
-        if(dirStatus){
-            if(process.env.DEBUG == '1'){
-                console.log(dirStatus)
-            }
-            else{
-                singleLineStatus(dirStatus)
-            }
-        }
-    }
+    if (dirStatus) {
+      if (process.env.DEBUG == "1") {
+        console.log(dirStatus);
+      } else {
+        singleLineStatus(dirStatus);
+      }
+    }
+  }
-    //add IDs after and strip full file name
-    let id = 0;
-    for(let file in files){
-        files[file].id = id++
-        delete files[file].name
-    }
+  //add IDs after and strip full file name
+  let id = 0;
+  for (let file in files) {
+    files[file].id = id++;
+    delete files[file].name;
+  }
-    var elapsed = parseHrtimeToSeconds(process.hrtime(startTime));
-    var m = Math.floor(elapsed / 60)
-    var s = Math.floor(elapsed % 60)
-    console.log(`\nFinished crawling Myrient in ${m}m${s}s.`)
-    await piscina.close()
-    return files
-}
+  var elapsed = parseHrtimeToSeconds(process.hrtime(startTime));
+  var m = Math.floor(elapsed / 60);
+  var s = Math.floor(elapsed % 60);
+  console.log(`\nFinished crawling Myrient in ${m}m${s}s.`);
+  await piscina.close();
+  return files;
+}
-function splitFilesAndFolders(dirArray){
-    let directories = []
-    let files = []
-    //first item is always the parent directory
-    for(let x = 1; x < dirArray.length; x++){
-        if(typeof dirArray[x] == 'undefined') continue
-        if(dirArray[x].size == '-'){
-            directories.push(dirArray[x])
-        }
-        else{
-            files.push(dirArray[x])
-        }
-    }
-    return{
-        directories: directories,
-        files: files
-    }
-}
+function splitFilesAndFolders(dirArray) {
+  let directories = [];
+  let files = [];
+  //first item is always the parent directory
+  for (let x = 1; x < dirArray.length; x++) {
+    if (typeof dirArray[x] == "undefined") continue;
+    if (dirArray[x].size == "-") {
+      directories.push(dirArray[x]);
+    } else {
+      files.push(dirArray[x]);
+    }
+  }
+  return {
+    directories: directories,
+    files: files,
+  };
+}
-function singleLineStatus(str){
-    if(process.stdout.isTTY){
-        process.stdout.clearLine(0)
-        process.stdout.cursorTo(0)
-        process.stdout.write(str)
-    }
-    else{
-        console.log(str)
-    }
-}
+function singleLineStatus(str) {
+  if (process.stdout.isTTY) {
+    process.stdout.clearLine(0);
+    process.stdout.cursorTo(0);
+    process.stdout.write(str);
+  } else {
+    console.log(str);
+  }
+}
-function parseHrtimeToSeconds(hrtime){
-    var seconds = (hrtime[0] + (hrtime[1] / 1e9)).toFixed(3);
-    return seconds;
-}
+function parseHrtimeToSeconds(hrtime) {
+  var seconds = (hrtime[0] + hrtime[1] / 1e9).toFixed(3);
+  return seconds;
+}


@@ -25,7 +25,7 @@ export async function getTableRows(data) {
     })
   }
   catch(error){
-    console.error(`\nFetch failed for ${data.url}, retries remaining: ${retryLeft}`, error)
+    console.error(`\nFetch failed for ${data.url}, retries remaining: ${retryLeft}\n`, error)
     await sleep(delayMs)
   }
   finally{


@@ -195,4 +195,4 @@ server.on("listening", function () {
 });
 console.log(`Loaded ${fileCount} known files.`);
-cron.schedule("0 0 0 * * *", getFilesJob);
+cron.schedule("0 30 2 * * *", getFilesJob);