2024-10-23 02:11:40 -06:00
|
|
|
import { getTableRows, parseOutFile } from "./fileworker.js";
|
|
|
|
|
import { Piscina, FixedQueue } from "piscina";
|
|
|
|
|
import { resolve } from "path";
|
|
|
|
|
import debugPrint from "./debugprint.js";
|
2025-01-28 20:14:19 -03:00
|
|
|
import { File } from './models/index.js';
|
|
|
|
|
import { bulkIndexFiles } from './services/elasticsearch.js';
|
2025-05-18 07:11:37 -06:00
|
|
|
import { optimizeDatabaseKws } from "./dboptimize.js";
|
2025-05-24 02:40:43 -06:00
|
|
|
import { Timer } from "./time.js";
|
2024-10-15 20:24:34 -06:00
|
|
|
|
2024-10-20 09:41:11 -06:00
|
|
|
// Options for the shared worker pool: the worker script is ./lib/fileworker.js
// (made absolute by resolve()), and queued tasks are held in Piscina's FixedQueue.
const workerPoolOptions = {
  filename: resolve("./lib", "fileworker.js"),
  taskQueue: new FixedQueue(),
};

// Worker pool used by getAllFiles() to run the "getTableRows" and
// "parseOutFile" tasks off the main thread.
let piscina = new Piscina(workerPoolOptions);

// Number of parsed files accumulated before they are flushed via processBatch().
const BATCH_SIZE = 1000;
|
|
|
|
|
|
2024-10-23 02:11:40 -06:00
|
|
|
/**
 * Crawls the Myrient file listing and hands every file found to processBatch().
 *
 * The top-level listing is fetched and parsed on the calling thread. Every
 * directory found is then fetched ("getTableRows") and its rows parsed
 * ("parseOutFile") on the Piscina worker pool. Parsed files are collected into
 * batches of BATCH_SIZE; parsed directories are queued for another round.
 *
 * Environment variables read:
 *   MAX_FETCH_JOBS - number of queued fetch tasks that forces them to be awaited.
 *   MAX_JOB_QUEUE  - pool queue size at which scheduling of parse tasks pauses.
 *   DEBUG          - "1" prints each status update on its own line.
 *
 * @param {*} catList - Passed through unchanged to every parseOutFile call.
 * @returns {Promise<number|undefined>} The number of files handed to
 *   processBatch(), or undefined when a directory fetch failed and the crawl
 *   was abandoned.
 */
export default async function getAllFiles(catList) {
  const proctime = new Timer();
  const url = "https://myrient.erista.me/files/";

  // Parse the limits once, with an explicit radix. When a variable is unset
  // the result is NaN, and every `>=` comparison against it is false, so the
  // corresponding limit is simply never hit.
  const maxFetchJobs = Number.parseInt(process.env.MAX_FETCH_JOBS, 10);
  const maxJobQueue = Number.parseInt(process.env.MAX_JOB_QUEUE, 10);

  // Seed the crawl from the top-level listing.
  const parentRows = await getTableRows({ url: url, base: "" });
  const parents = [];
  for (let x = 0; x < parentRows.html.length; x++) {
    parents.push(
      await parseOutFile({
        file: parentRows.html[x],
        base: "",
        url: url,
        catList: catList,
      })
    );
  }

  // First run should only have directories, so any files returned for the
  // top-level listing are ignored here.
  const dirs = splitFilesAndFolders(parents).directories;

  let fetchTasks = []; // in-flight "getTableRows" promises
  const resolvedFetchTasks = []; // fetched listings not yet turned into parse tasks
  let parseTasks = []; // in-flight "parseOutFile" promises
  let fileCount = 0;
  let currentBatch = [];

  while (
    dirs.length > 0 ||
    fetchTasks.length > 0 ||
    parseTasks.length > 0 ||
    resolvedFetchTasks.length > 0
  ) {
    let dirStatus = "";

    // Queue at most one directory fetch per iteration.
    if (dirs.length > 0) {
      const nextDir = dirs.shift();
      debugPrint(`Queueing: ${nextDir.name}`);
      fetchTasks.push(
        piscina
          .run(
            { url: nextDir.path, base: nextDir.name },
            { name: "getTableRows" }
          )
          .catch((err) => {
            // A failed fetch resolves to undefined; it is detected below.
            console.error(err);
          })
      );
    }

    // Await the fetches when enough are queued, or when there is fetch work
    // outstanding and no parse work to overlap it with.
    const fetchLimitReached = fetchTasks.length >= maxFetchJobs;
    const onlyFetchWorkLeft =
      (fetchTasks.length > 0 || resolvedFetchTasks.length > 0) &&
      parseTasks.length === 0;

    if (fetchLimitReached || onlyFetchWorkLeft) {
      debugPrint(`Resolving ${fetchTasks.length} fetch tasks.`);
      const settledTasks = await Promise.all(fetchTasks);
      resolvedFetchTasks.push(...settledTasks);

      while (resolvedFetchTasks.length > 0) {
        // Stop scheduling parse tasks while the pool queue is full; the
        // remaining listings stay in resolvedFetchTasks for a later pass.
        if (piscina.queueSize >= maxJobQueue) {
          break;
        }

        const completedTask = resolvedFetchTasks[0];
        if (!completedTask) {
          // Note: this returns before piscina.close() and before the pending
          // batch is flushed.
          console.log("Myrient crawl failed, try again later.");
          return;
        }

        for (let y = 0; y < completedTask.html.length; y++) {
          parseTasks.push(
            piscina.run(
              {
                file: completedTask.html[y],
                base: completedTask.base,
                url: completedTask.url,
                catList: catList,
              },
              { name: "parseOutFile" }
            )
          );
        }
        resolvedFetchTasks.shift();
      }

      fetchTasks = [];
      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${fileCount}`;
    }

    // Once every known directory has been queued, collect the parse results.
    if (dirs.length === 0 && parseTasks.length > 0) {
      debugPrint(`Resolving ${parseTasks.length} parse tasks.`);
      const settledTasks = await Promise.all(parseTasks);
      // NOTE(review): splitFilesAndFolders skips only index 0 of this combined
      // array, although it may hold rows from several directory listings -
      // confirm the other parent-directory rows are filtered by parseOutFile.
      const working = splitFilesAndFolders(settledTasks);

      // Push one at a time (no spread) so very large listings cannot overflow
      // the call stack, flushing every time a batch fills up.
      for (const file of working.files) {
        currentBatch.push(file);
        if (currentBatch.length >= BATCH_SIZE) {
          await processBatch(currentBatch);
          fileCount += currentBatch.length;
          currentBatch = [];
        }
      }

      for (const directory of working.directories) {
        dirs.push(directory);
      }

      parseTasks = [];
      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${fileCount}`;
    }

    if (dirStatus) {
      if (process.env.DEBUG === "1") {
        console.log(dirStatus);
      } else {
        singleLineStatus(dirStatus);
      }
    }
  }

  // Process any remaining files in the last batch
  if (currentBatch.length > 0) {
    await processBatch(currentBatch);
    fileCount += currentBatch.length;
  }

  console.log(`\nFinished crawling Myrient in ${proctime.elapsed()}.`);
  await piscina.close();
  await optimizeDatabaseKws();
  return fileCount;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Stores a batch of crawled files and indexes the stored rows in Elasticsearch.
 *
 * The batch is written in chunks of 1000 with File.bulkCreate(), and the rows
 * returned by each call are passed to bulkIndexFiles(). Any error is logged
 * and swallowed, which also skips the chunks that had not been processed yet.
 *
 * @param {Array<object>} files - Parsed file entries to persist.
 * @returns {Promise<void>}
 */
async function processBatch(files) {
  // Process in small chunks to avoid memory issues
  const chunkSize = 1000;

  // Copy only the fields that are written to the database.
  const toRecord = ({
    filename,
    path,
    size,
    category,
    type,
    date,
    region,
    group,
    nongame,
  }) => ({
    filename,
    path,
    size,
    category,
    type,
    date,
    region,
    group,
    nongame,
  });

  try {
    let offset = 0;
    while (offset < files.length) {
      const chunk = files.slice(offset, offset + chunkSize);
      const records = chunk.map(toRecord);
      const dbFiles = await File.bulkCreate(records, {
        returning: true,
        updateOnDuplicate: ['path'],
      });

      // Index chunk in Elasticsearch
      await bulkIndexFiles(dbFiles);

      debugPrint(`Processed ${offset + chunk.length} of ${files.length} files in current batch`);
      offset += chunkSize;
    }
  } catch (error) {
    console.error('Error processing batch:', error);
  }
}
|
2024-10-16 03:09:31 -06:00
|
|
|
|
2024-10-23 02:11:40 -06:00
|
|
|
/**
 * Partitions parsed listing entries into directories and files.
 *
 * The entry at index 0 is always skipped (the first item is treated as the
 * parent directory). Entries that are null or undefined are ignored. An entry
 * whose `size` is the string "-" is a directory; everything else is a file.
 *
 * @param {Array<object|null|undefined>} dirArray - Parsed listing entries.
 * @returns {{directories: object[], files: object[]}} The entries split by
 *   kind, in their original order.
 */
function splitFilesAndFolders(dirArray) {
  const directories = [];
  const files = [];

  // first item is always the parent directory
  for (let x = 1; x < dirArray.length; x++) {
    const entry = dirArray[x];

    // `== null` matches both null and undefined, so a null entry is skipped
    // instead of throwing on the `.size` access below.
    if (entry == null) continue;

    if (entry.size === "-") {
      directories.push(entry);
    } else {
      files.push(entry);
    }
  }

  return { directories, files };
}
|
|
|
|
|
|
2024-10-23 02:11:40 -06:00
|
|
|
/**
 * Prints a status message.
 *
 * When stdout is a TTY the current line is cleared and overwritten in place;
 * otherwise the message is logged as a normal line.
 *
 * @param {string} str - Status text to display.
 */
function singleLineStatus(str) {
  const out = process.stdout;

  if (!out.isTTY) {
    console.log(str);
    return;
  }

  out.clearLine(0);
  out.cursorTo(0);
  out.write(str);
}
|