Mirror of https://github.com/alexankitty/Myrient-Search-Engine.git, synced 2026-01-15 16:33:15 -03:00
Merge pull request #8 from ovosimpatico/main
Use PostgreSQL and Elasticsearch instead of file-based storage
11 .env

@@ -13,3 +13,14 @@ MAX_FETCH_JOBS=1000
INSTANCE_NAME=Myrient
# Enable the built-in emulator
EMULATOR_ENABLED=true

# Run docker-compose.dev.yml for running locally
# Database Configuration
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=myrient
POSTGRES_USER=postgres
POSTGRES_PASSWORD=development

# Elasticsearch Configuration
ELASTICSEARCH_URL=http://localhost:9200
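For reference, the new variables are read through `dotenv` and consumed by the Sequelize and Elasticsearch clients introduced later in this diff (lib/database.js and lib/services/elasticsearch.js). A condensed sketch of that wiring, with the same defaults as the `.env` above:

```js
import 'dotenv/config';
import { Sequelize } from 'sequelize';
import { Client } from '@elastic/elasticsearch';

// PostgreSQL connection built from the POSTGRES_* variables
const sequelize = new Sequelize(
  process.env.POSTGRES_DB,
  process.env.POSTGRES_USER,
  process.env.POSTGRES_PASSWORD,
  {
    host: process.env.POSTGRES_HOST || 'localhost',
    port: process.env.POSTGRES_PORT || 5432,
    dialect: 'postgres'
  }
);

// Elasticsearch client built from ELASTICSEARCH_URL
const client = new Client({
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
});

// Quick connectivity check against both services
await sequelize.authenticate();
await client.ping();
```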
13 README.md

@@ -2,15 +2,14 @@
It is finally here. There is now a way to search all of Myrient's offerings.
[Myrient Search](https://myrient.mahou.one) can be accessed by clicking the link.

# Resource Requirements
- Requires at least 1.1GB worth of memory to complete the crawl
- Requires roughly 2.1GB worth of memory to complete indexing. Once done idle memory usage is about 1.1GB. Consider using swap in a low memory environment.
- 1.5GB-ish of memory for the initial crawl (can be reduced by tweaking environment variables at the cost of slower indexing)
- 800MB-ish of memory for running the server

# Self-Hosting

## Docker Method (Recommended)
### Requirements
- Docker
- Docker Compose
- Docker / Docker Compose

### Instructions
1. Download the `docker-compose.yml` file

@@ -20,11 +19,15 @@ It is finally here. There is now a way to search all of Myrient's offerings.
### Requirements
- nodejs
- npm
- PostgreSQL
- Elasticsearch
- Docker (optional)

### Instructions
1. Clone the repository. `git clone https://github.com/alexankitty/Myrient-Search-Engine`
2. Install dependencies. `npm i`
3. Start the server. `node server.js`
3. Run your PostgreSQL and Elasticsearch instances. A docker compose file is provided in the repository for convenience.
4. Start the server. `node server.js`
# HTTPS Encryption
Use something like `nginx` and add a site to sites-available called myrient-search in `/etc/nginx/sites-available`.
31 docker-compose.dev.yml (new file)

@@ -0,0 +1,31 @@
# Run this for development
services:
  elasticsearch:
    image: elasticsearch:8.17.1
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
    deploy:
      resources:
        limits:
          memory: 1GB
    ports:
      - "9200:9200"
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data

  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: development
      POSTGRES_DB: myrient
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data

volumes:
  elasticsearch_data:
  postgres_data:
docker-compose.yml

@@ -1,5 +1,3 @@
version: '3.8'

services:
  myrient-search:
    image: ghcr.io/alexankitty/myrient-search-engine:latest
@@ -15,6 +13,57 @@ services:
      - MAX_FETCH_JOBS=1000
      - INSTANCE_NAME=Myrient
      - EMULATOR_ENABLED=true
      - POSTGRES_HOST=postgres
      - POSTGRES_PORT=5432
      - POSTGRES_DB=myrient
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=this-is-a-secure-db-password
      - ELASTICSEARCH_URL=http://elasticsearch:9200
    volumes:
      - ./data:/usr/src/app/data
    restart: unless-stopped
    depends_on:
      postgres:
        condition: service_healthy
      elasticsearch:
        condition: service_healthy
    restart: unless-stopped

  elasticsearch:
    image: elasticsearch:8.17.1
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
    healthcheck:
      test: ["CMD-SHELL", "curl -s http://localhost:9200/_cluster/health | grep -q '\"status\":\"green\"\\|\"status\":\"yellow\"'"]
      interval: 10s
      timeout: 10s
      retries: 5
      start_period: 40s
    deploy: # Remove if you have a lot of free memory
      resources:
        limits:
          memory: 1GB
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    restart: unless-stopped

  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: this-is-a-secure-db-password
      POSTGRES_DB: myrient
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5
      start_period: 10s
    restart: unless-stopped

volumes:
  elasticsearch_data:
  postgres_data:
76 lib/database.js (new file)

@@ -0,0 +1,76 @@
import { Sequelize } from 'sequelize';
import 'dotenv/config';

// Import models
import defineFile from './models/file.js';
import defineQueryCount from './models/queryCount.js';

const sequelize = new Sequelize(process.env.POSTGRES_DB, process.env.POSTGRES_USER, process.env.POSTGRES_PASSWORD, {
  host: process.env.POSTGRES_HOST || 'localhost',
  port: process.env.POSTGRES_PORT || 5432,
  dialect: 'postgres',
  logging: process.env.DEBUG === '1' ? console.log : false
});

// Initialize models
export const File = defineFile(sequelize);
export const QueryCount = defineQueryCount(sequelize);

export async function initDB() {
  try {
    // First try to connect to postgres directly to create database if needed
    const rootSequelize = new Sequelize('postgres', process.env.POSTGRES_USER, process.env.POSTGRES_PASSWORD, {
      host: process.env.POSTGRES_HOST || 'localhost',
      port: process.env.POSTGRES_PORT || 5432,
      dialect: 'postgres',
      logging: false
    });

    try {
      // Try to create database if it doesn't exist
      await rootSequelize.query(`CREATE DATABASE ${process.env.POSTGRES_DB};`);
      console.log('Database did not exist, created.');
    } catch (err) {
      // Ignore error if database already exists
      if (!err.message.includes('already exists')) {
        throw err;
      }
    } finally {
      await rootSequelize.close();
    }

    // Now connect to the actual database
    await sequelize.authenticate();
    console.log('DB connected.');

    // Get current database schema
    const queryInterface = sequelize.getQueryInterface();
    const tables = await queryInterface.showAllTables();

    if (!tables.includes('Files') || !tables.includes('QueryCounts')) {
      // If tables don't exist, create them
      console.log('DB doesn\'t exist, creating initial database schema...');
      await sequelize.sync();
      console.log('Database schema created.');

      // Initialize QueryCount if it's a new installation
      await QueryCount.create({ count: 0 });
    } else {
      // Auto-migrate existing schema
      console.log('Checking for DB migrations...');
      await sequelize.sync({ alter: true });
      console.log('DB migrations completed.');
    }

    // Only force sync if explicitly requested
    if (process.env.FORCE_FILE_REBUILD === '1') {
      await sequelize.sync({ force: true });
      console.log('DB forcefully synchronized.');
    }
  } catch (error) {
    console.error('Unable to connect to the DB:', error);
    process.exit(1);
  }
}

export default sequelize;
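A minimal usage sketch of the exported helpers, condensed from how server.js (further down in this diff) boots the database layer:

```js
import { initDB, File, QueryCount } from './lib/database.js';

// Create the database if needed, connect, and sync/migrate the schema
await initDB();

// The Sequelize models are then ready for normal queries
const fileCount = await File.count();
const queryCount = (await QueryCount.findOne())?.count || 0;
console.log(`Loaded ${fileCount} known files, ${queryCount} queries served.`);
```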
lib/dircrawl.js

@@ -2,12 +2,16 @@ import { getTableRows, parseOutFile } from "./fileworker.js";
import { Piscina, FixedQueue } from "piscina";
import { resolve } from "path";
import debugPrint from "./debugprint.js";
import { File } from './models/index.js';
import { bulkIndexFiles } from './services/elasticsearch.js';

let piscina = new Piscina({
  filename: resolve("./lib", "fileworker.js"),
  taskQueue: new FixedQueue(),
});

const BATCH_SIZE = 1000; // Process files in batches for better performance

export default async function getAllFiles(catList) {
  var startTime = process.hrtime();
  const url = "https://myrient.erista.me/files/";
@@ -29,6 +33,9 @@ export default async function getAllFiles(catList) {
  let fetchTasks = [];
  let resolvedFetchTasks = [];
  let parseTasks = [];
  let fileCount = 0;
  let currentBatch = [];

  while (
    dirs.length > 0 ||
    fetchTasks.length > 0 ||
@@ -38,7 +45,6 @@ export default async function getAllFiles(catList) {
    let dirStatus = "";
    if (dirs.length > 0) {
      debugPrint(`Queueing: ${dirs[0].name}`);
      //add tasks
      fetchTasks.push(
        piscina
          .run(
@@ -51,7 +57,7 @@ export default async function getAllFiles(catList) {
      );
      dirs.shift();
    }
    //push completed fetch tasks to parse

    if (
      fetchTasks.length >= parseInt(process.env.MAX_FETCH_JOBS) ||
      ((fetchTasks.length > 0 || resolvedFetchTasks.length > 0) &&
@@ -61,8 +67,7 @@ export default async function getAllFiles(catList) {
      let settledTasks = await Promise.all(fetchTasks);
      resolvedFetchTasks.push(...settledTasks);
      while (resolvedFetchTasks.length > 0) {
        if (piscina.queueSize >=parseInt(process.env.MAX_JOB_QUEUE)) {
          //jump out if we have a ton of tasks scheduled.
        if (piscina.queueSize >= parseInt(process.env.MAX_JOB_QUEUE)) {
          break;
        }
        let completedTask = resolvedFetchTasks[0];
@@ -86,25 +91,38 @@ export default async function getAllFiles(catList) {
        resolvedFetchTasks.shift();
      }

      fetchTasks = []; //purge
      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`;
      fetchTasks = [];
      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${fileCount}`;
    }
    //resolve parse tasks to go through fetch tasks

    if (dirs.length == 0 && parseTasks.length > 0) {
      if (process.env.DEBUG == "1") {
        console.log(`Resolving ${parseTasks.length} parse tasks.`);
      }
      debugPrint(`Resolving ${parseTasks.length} parse tasks.`);
      let settledTasks = await Promise.all(parseTasks);
      let working = splitFilesAndFolders(settledTasks);

      if (working.files.length > 0) {
        files.push(...working.files);
        // Process files in smaller chunks to avoid stack overflow
        for (let i = 0; i < working.files.length; i++) {
          currentBatch.push(working.files[i]);
          if (currentBatch.length >= BATCH_SIZE) {
            await processBatch(currentBatch);
            fileCount += currentBatch.length;
            currentBatch = [];
          }
        }
      }

      if (working.directories.length > 0) {
        dirs.push(...working.directories);
        // Process directories in chunks to avoid stack overflow
        for (let i = 0; i < working.directories.length; i++) {
          dirs.push(working.directories[i]);
        }
      }
      parseTasks = []; //purge
      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${files.length}`;

      parseTasks = [];
      dirStatus = `Directories Remaining: ${dirs.length}, Files Found: ${fileCount}`;
    }

    if (dirStatus) {
      if (process.env.DEBUG == "1") {
        console.log(dirStatus);
@@ -113,18 +131,51 @@ export default async function getAllFiles(catList) {
      }
    }
  }
  //add IDs after and strip full file name
  let id = 0;
  for (let file in files) {
    files[file].id = id++;
    delete files[file].name;

  // Process any remaining files in the last batch
  if (currentBatch.length > 0) {
    await processBatch(currentBatch);
    fileCount += currentBatch.length;
  }

  var elapsed = parseHrtimeToSeconds(process.hrtime(startTime));
  var m = Math.floor(elapsed / 60);
  var s = Math.floor(elapsed % 60);
  console.log(`\nFinished crawling Myrient in ${m}m${s}s.`);
  await piscina.close();
  return files;
  return fileCount;
}

async function processBatch(files) {
  try {
    // Process in small chunks to avoid memory issues
    const chunkSize = 1000;
    for (let i = 0; i < files.length; i += chunkSize) {
      const chunk = files.slice(i, i + chunkSize);
      const dbFiles = await File.bulkCreate(
        chunk.map(file => ({
          filename: file.filename,
          path: file.path,
          size: file.size,
          category: file.category,
          type: file.type,
          date: file.date,
          region: file.region,
          group: file.group
        })),
        {
          returning: true,
          updateOnDuplicate: ['path']
        }
      );

      // Index chunk in Elasticsearch
      await bulkIndexFiles(dbFiles);
      debugPrint(`Processed ${i + chunk.length} of ${files.length} files in current batch`);
    }
  } catch (error) {
    console.error('Error processing batch:', error);
  }
}

function splitFilesAndFolders(dirArray) {
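In short, the crawler no longer accumulates every file in memory and returns an array; processBatch() above flushes BATCH_SIZE-sized chunks into PostgreSQL (upserting against the unique path column) and Elasticsearch as the crawl runs, and only a running count is returned. A rough sketch of the call as server.js now uses it; reading the category list with FileHandler.parseJsonFile is an assumption based on how the old file list was loaded:

```js
import getAllFiles from './lib/dircrawl.js';
import FileHandler from './lib/filehandler.js';

// Assumption: the category list is parsed from lib/categories.json the same
// way server.js parses other JSON files
const categoryList = await FileHandler.parseJsonFile('./lib/categories.json');

// Crawls Myrient, writing batches to PostgreSQL/Elasticsearch as it goes,
// and returns only the number of files found
const fileCount = await getAllFiles(categoryList);
console.log(`${fileCount} files crawled and indexed.`);
```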
lib/fileworker.js

@@ -3,7 +3,7 @@ import HTMLParse from "node-html-parser";

export async function getTableRows(data) {
  let retryLeft = 5;
  const delayMs = 500
  const delayMs = 500
  while (retryLeft) {
    try{
      return await fetch(data.url)
@@ -40,6 +40,7 @@ export async function parseOutFile(data) {
    let path = file.querySelector(".link").firstChild.getAttribute("href");
    if (path == "../") return;
    let name = innertext(file.querySelector(".link").innerHTML).trim();
    if (name == "Parent directory/") return;
    let fullName = data.base + name;
    let size = innertext(file.querySelector(".size").innerHTML).trim();
    let cats = findCategory(fullName, data.catList)
49 lib/models/file.js (new file)

@@ -0,0 +1,49 @@
import { DataTypes } from 'sequelize';

export default function (sequelize) {
  const File = sequelize.define('File', {
    id: {
      type: DataTypes.INTEGER,
      primaryKey: true,
      autoIncrement: true
    },
    filename: {
      type: DataTypes.STRING,
      allowNull: false
    },
    path: {
      type: DataTypes.TEXT,
      allowNull: false,
      unique: true
    },
    size: {
      type: DataTypes.TEXT,
      allowNull: false
    },
    category: {
      type: DataTypes.TEXT,
      allowNull: false
    },
    type: {
      type: DataTypes.TEXT
    },
    date: {
      type: DataTypes.TEXT
    },
    region: {
      type: DataTypes.TEXT
    },
    group: {
      type: DataTypes.TEXT
    }
  }, {
    indexes: [
      { fields: ['filename'] },
      { fields: ['category'] },
      { fields: ['type'] },
      { fields: ['region'] }
    ]
  });

  return File;
}
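A short, hypothetical query against the model, touching the indexed columns defined above; the category value is only an example and not taken from the repository:

```js
import { File } from './lib/models/index.js';

// Filters on category, one of the indexed columns; 'No-Intro' is illustrative only
const files = await File.findAll({
  where: { category: 'No-Intro' },
  limit: 25
});
console.log(files.map(f => f.filename));
```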
1 lib/models/index.js (new file)

@@ -0,0 +1 @@
export { File, QueryCount } from '../database.js';
18 lib/models/queryCount.js (new file)

@@ -0,0 +1,18 @@
import { DataTypes } from 'sequelize';

export default function (sequelize) {
  const QueryCount = sequelize.define('QueryCount', {
    id: {
      type: DataTypes.INTEGER,
      primaryKey: true,
      autoIncrement: true
    },
    count: {
      type: DataTypes.INTEGER,
      defaultValue: 0,
      allowNull: false
    }
  });

  return QueryCount;
}
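The single counter row is created by initDB() on a fresh install and then bumped by the routes in server.js; condensed:

```js
import { QueryCount } from './lib/models/index.js';

// Read the counter row created by initDB() on first run...
let queryCount = (await QueryCount.findOne())?.count || 0;

// ...then persist an incremented value, as the /search route does
queryCount += 1;
await QueryCount.update({ count: queryCount }, { where: { id: 1 } });
```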
@@ -51,6 +51,7 @@
    "update",
    "utility",
    "video",
    "Virtual Console",
    "wallpaper"
  ]
}
212 lib/search.js

@@ -1,196 +1,40 @@
import MiniSearch from 'minisearch'
import debugPrint from './debugprint.js'
import { search as elasticSearch, getSuggestions as elasticSuggestions } from './services/elasticsearch.js'
import { File } from './models/index.js'

export default class Searcher{
  constructor(fields, stringGroups){
    this.distance = parseFloat(process.env.FUZZY_DISTANCE)
    this.minMatch = parseFloat(process.env.MIN_MATCH)
    this.indexing = false
    this.stringGroups = stringGroups
export default class Searcher {
  constructor(fields) {
    this.fields = [...fields]
  }

  termProcessor(term){
    term = term.toLowerCase()
    let stringArray = [term]
    stringArray.push(...Searcher.stringBreakout(term))
    for(let group in searchAlikes.StringAssoc){
      let currentGroup = searchAlikes.StringAssoc[group]
      let leadString = currentGroup[0]
      if(term == leadString){
        for(let index in currentGroup){
          let currentString = currentGroup[index]
          stringArray.push(...Searcher.stringBreakout(currentString))
        }
      }
    }
    return [...new Set(stringArray)]
  }

  static stringBreakout(string){
    let symbolRegex = /-|_|\+|=|\)|\(|\[|{|}|]|;|:|"|'|<|>|\.|,|\/|\?|\||\\|!|@|#|\$|%|\^|&|\*/g
    let array = [string]
    let workingString = ''
    array.push(string.replaceAll(symbolRegex, ''))
    array.push(...string.split(' '))
    workingString = string.replaceAll(symbolRegex, ' ')
    array.push(...workingString.split(' '))
    return [...new Set(array)]
  }

  stringToWordArray(string){
    let symbolRegex = /-|_|\+|=|\)|\(|\[|{|}|]|;|:|"|'|<|>|\.|,|\/|\?|\||\\|!|@|#|\$|%|\^|&|\*/g
    let workingString = string.replaceAll(symbolRegex, ' ')
    let stringArray = workingString.split(' ')
    return stringArray.filter(entry => entry.trim() != '');
  }

  async findAllMatches(query, options){
    try{
      let optionsValue = structuredClone(options)
      var startTime = process.hrtime();
      optionsValue.fields.push('hidden')
      debugPrint(options)
      let results = this.miniSearch.search(query, optionsValue)
      var elapsed = this.parseHrtimeToSeconds(process.hrtime(startTime));
      return {
        items: results,
        elapsed: elapsed
      }
    }
    catch(err){
      console.error(err)
    }
  }

  async createIndex(fileArr){
    if(!this.miniSearch){
      this.miniSearch = new MiniSearch({
        fields: [...this.fields, 'hidden'],
        storeFields: ['filename', 'category', 'type', 'date', 'size', 'region', 'path', 'id', 'group'],
        processTerm: this.termProcessor
      })
    }
    else{
      this.miniSearch.removeAll()
    }
    this.indexing = true
    this.miniSearch.addAll(fileArr)
    console.log('File list indexing completed.')
    console.log(`Total terms in index: ${this.miniSearch.termCount}`)
    this.indexing = false
  }
  async updateIndex(fileArr){
    let fields = [...this.fields]
    fields.push('id')
    console.log('Performing Index Update.')
    for(let x = 0; x < fileArr.length; x++){
      let searchIndex = this.findIndex(x)
      if(!searchIndex){
        //add if it doesn't exist in the index
        debugPrint(`Adding index ${x}`)
        this.miniSearch.add(fileArr[x])
        continue
      }
      let changed = false
      for(let field in fields){
        let fieldName = fields[field]
        let searchField = searchIndex[fieldName]
        let fileField = fileArr[x][fieldName]
        debugPrint(`${fieldName}: ${searchField} ${fileField}`)
        if(searchField == fileField){
          changed = true
        }
      }
      if(changed){
        debugPrint(`Updating Index ${x}`)
        this.miniSearch.replace(fileArr[x])
      }

  async findAllMatches(query, options) {
    try {
      return await elasticSearch(query, options)
    } catch (err) {
      console.error(err)
      return { items: [], elapsed: 0 }
    }
    let indexSize = this.getIndexSize()
    if(indexSize > fileArr.length){
      debugPrint(`Removing indices ${fileArr.length}-${indexSize}.`)
      //clean up indices that are no longer relevant
      for(let x = fileArr.length; x < indexSize; x++){
        this.miniSearch.discard(x)
      }
    }

  async getSuggestions(query, options) {
    try {
      return await elasticSuggestions(query, options)
    } catch (err) {
      console.error(err)
      return []
    }
    console.log(`Completed index update. New Term Count: ${this.miniSearch.termCount}`)
  }
  parseHrtimeToSeconds(hrtime){
    var seconds = (hrtime[0] + (hrtime[1] / 1e9)).toFixed(3);
    return seconds;

  findIndex(id) {
    return File.findByPk(id)
  }
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));

  async getIndexSize() {
    return await File.count()
  }
  findIndex(id){
    //this might be a hack
    return this.miniSearch._storedFields.get(id)
  }
  getIndexSize(){
    return this.miniSearch._storedFields.size
  }
  async getSuggestions(query, options){
    query = query.toLowerCase()
    options.fields = ['filename', 'category'] //reduce field search
    let matches = await this.findAllMatches(query, options)
    let results = matches.items
    let suggestions = []
    for(let result = 0; result < results.length; result++){
      let currentResult = results[result]
      let fileString = String(currentResult.filename).toLowerCase()
      let categoryString = String(currentResult.category).toLowerCase()
      let fileSplit = fileString.split(query)
      let categorySplit = categoryString.split(query)
      if(fileSplit.length > 1){
        let wordSplit = this.stringToWordArray(fileSplit[1])
        let prediction = ''
        let prefixMatch = String(fileSplit[1]).substring(0,1) != ' '
        let prefixSpace = prefixMatch ? '' : ' '
        if(wordSplit.length > 1){
          prediction = `${prefixSpace}${wordSplit[0]} ${wordSplit[1]}`
        }
        else if (wordSplit.length == 1){
          prediction = `${prefixSpace}${wordSplit[0]}`
        }
        else {
          //bad result discard
          continue
        }
        suggestions.push(`${query}${prediction}`)
        continue
      }
      if(categorySplit.length > 1){
        let wordSplit = this.stringToWordArray(categorySplit[1])
        if(!wordSplit[0]){
          wordSplit.shift()
        }
        let prediction = ''
        let prefixMatch = String(categorySplit[1]).substring(0,1) != ' '
        let prefixSpace = prefixMatch ? '' : ' '
        if(wordSplit.length > 1){
          prediction = `${prefixSpace}${wordSplit[0]} ${wordSplit[1]}`
        }
        else if (wordSplit.length == 1){
          prediction = `${prefixSpace}${wordSplit[0]}`
        }
        else {
          //bad result discard
          continue
        }
        suggestions.push(`${query}${prediction}`)
        continue
      }
    }
    let dedupe = [...new Set(suggestions)]
    let dedupeLimit = dedupe.length >= 10 ? 10 : dedupe.length
    let arr = []
    for(let x = 0; x < dedupeLimit; x++){
      arr.push({
        suggestion: dedupe[x]
      })
    }
    return arr

  get termCount() {
    return 0 // Not applicable with Elasticsearch
  }
}
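With the MiniSearch index gone, the class is now a thin adapter over the Elasticsearch service; a usage sketch matching how server.js drives it, where the query string and option values are illustrative rather than taken from the repository:

```js
import Searcher from './lib/search.js';

const search = new Searcher(['filename', 'category', 'type', 'region']);

// Options mirror the settings object server.js passes through from the UI;
// the concrete values here are only an example
const results = await search.findAllMatches('mario kart', {
  fields: ['filename', 'category', 'type', 'region'],
  fuzzy: 1,
  combineWith: 'AND'
});
console.log(`${results.items.length} hits in ${results.elapsed}s`);

// Suggestions delegate to the Elasticsearch service as well
const suggestions = await search.getSuggestions('mario', {
  fields: ['filename', 'category']
});
```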
233 lib/services/elasticsearch.js (new file)

@@ -0,0 +1,233 @@
import { Client } from '@elastic/elasticsearch';
import debugPrint from '../debugprint.js';
import { File } from '../models/index.js';

const client = new Client({
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
});

const INDEX_NAME = 'myrient_files';

export async function initElasticsearch() {
  try {
    const indexExists = await client.indices.exists({ index: INDEX_NAME });

    if (!indexExists) {
      await client.indices.create({
        index: INDEX_NAME,
        body: {
          settings: {
            analysis: {
              analyzer: {
                filename_analyzer: {
                  type: 'custom',
                  tokenizer: 'standard',
                  filter: ['lowercase', 'word_delimiter_graph']
                }
              }
            }
          },
          mappings: {
            properties: {
              filename: {
                type: 'text',
                analyzer: 'filename_analyzer'
              },
              category: {
                type: 'text',
                analyzer: 'standard',
                fields: {
                  keyword: {
                    type: 'keyword'
                  }
                }
              },
              type: {
                type: 'text',
                analyzer: 'standard'
              },
              region: {
                type: 'text',
                analyzer: 'standard'
              }
            }
          }
        }
      });
      console.log('Elasticsearch index created');
    }
  } catch (error) {
    console.error('Elasticsearch init error:', error);
    process.exit(1);
  }
}

export async function indexFile(file) {
  try {
    await client.index({
      index: INDEX_NAME,
      id: file.id.toString(),
      document: file
    });
    debugPrint(`Indexed file: ${file.filename}`);
  } catch (error) {
    console.error('Error indexing file:', error);
  }
}

export async function bulkIndexFiles(files) {
  const operations = files.flatMap(file => [
    { index: { _index: INDEX_NAME, _id: file.id.toString() } },
    {
      filename: file.filename,
      category: file.category,
      type: file.type,
      region: file.region
    }
  ]);

  try {
    const { errors, items } = await client.bulk({
      refresh: true,
      operations
    });

    if (errors) {
      console.error('Bulk indexing had errors');
      items.forEach(item => {
        if (item.index.error) {
          console.error(item.index.error);
        }
      });
    }

    debugPrint(`Bulk indexed ${files.length} files`);
  } catch (error) {
    console.error('Bulk indexing error:', error);
  }
}

export async function search(query, options) {
  const searchQuery = {
    index: INDEX_NAME,
    body: {
      size: 1000,
      query: {
        bool: {
          must: buildMustClauses(query, options),
          should: buildShouldClauses(query, options)
        }
      },
      highlight: {
        fields: {
          filename: {},
          category: {},
          type: {},
          region: {}
        }
      }
    }
  };

  try {
    const startTime = process.hrtime();
    const response = await client.search(searchQuery);
    const elapsed = parseHrtimeToSeconds(process.hrtime(startTime));

    // Fetch full records from PostgreSQL for the search results
    const ids = response.hits.hits.map(hit => hit._id);
    const fullRecords = await File.findAll({
      where: { id: ids }
    });

    // Create a map of full records by id
    const recordMap = fullRecords.reduce((map, record) => {
      map[record.id] = record;
      return map;
    }, {});

    // Combine Elasticsearch results with full PostgreSQL records
    return {
      items: response.hits.hits.map(hit => ({
        ...recordMap[hit._id].dataValues,
        score: hit._score,
        highlights: hit.highlight
      })),
      elapsed
    };
  } catch (error) {
    console.error('Search error:', error);
    return { items: [], elapsed: 0 };
  }
}

function buildMustClauses(query, options) {
  const clauses = [];

  if (options.combineWith === 'AND') {
    query.split(' ').forEach(term => {
      clauses.push({
        multi_match: {
          query: term,
          fields: options.fields.map(field =>
            field === 'filename' ? `${field}^2` : field
          ),
          fuzziness: options.fuzzy || 0,
          type: 'best_fields'
        }
      });
    });
  }

  return clauses;
}

function buildShouldClauses(query, options) {
  const clauses = [];

  if (options.combineWith !== 'AND') {
    clauses.push({
      multi_match: {
        query,
        fields: options.fields.map(field =>
          field === 'filename' ? `${field}^2` : field
        ),
        fuzziness: options.fuzzy || 0,
        type: 'best_fields'
      }
    });
  }

  return clauses;
}

function parseHrtimeToSeconds(hrtime) {
  return (hrtime[0] + (hrtime[1] / 1e9)).toFixed(3);
}

export async function getSuggestions(query, options) {
  try {
    const response = await client.search({
      index: INDEX_NAME,
      body: {
        query: {
          multi_match: {
            query,
            fields: ['filename^2', 'category'],
            fuzziness: 'AUTO',
            type: 'best_fields'
          }
        },
        _source: ['filename', 'category'],
        size: 10
      }
    });

    return response.hits.hits.map(hit => ({
      suggestion: hit._source.filename
    }));
  } catch (error) {
    console.error('Suggestion error:', error);
    return [];
  }
}
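For clarity, when combineWith is 'AND' the helpers above emit one multi_match clause per whitespace-separated term into bool.must (otherwise a single clause goes into bool.should), with the filename field boosted by ^2. A sketch of the approximate request body that client.search() receives for a two-word query; the highlight section is omitted and the concrete values are illustrative:

```js
// Approximate body produced by search('mario kart',
//   { combineWith: 'AND', fields: ['filename', 'category'], fuzzy: 1 })
const body = {
  size: 1000,
  query: {
    bool: {
      must: [
        { multi_match: { query: 'mario', fields: ['filename^2', 'category'], fuzziness: 1, type: 'best_fields' } },
        { multi_match: { query: 'kart', fields: ['filename^2', 'category'], fuzziness: 1, type: 'best_fields' } }
      ],
      should: []
    }
  }
};
```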
1741 package-lock.json (generated): file diff suppressed because it is too large.
10 package.json

@@ -1,20 +1,24 @@
{
  "dependencies": {
    "@elastic/elasticsearch": "^8.12.2",
    "compression": "^1.7.4",
    "dotenv": "^16.4.5",
    "ejs": "^3.1.10",
    "express": "^4.21.1",
    "figlet": "^1.7.0",
    "file-older-than": "^1.0.0",
    "innertext": "^1.0.3",
    "jsdom": "^25.0.1",
    "minisearch": "^7.1.0",
    "jszip": "^3.10.1",
    "node-cron": "^3.0.3",
    "node-fetch": "^3.3.2",
    "node-html-parser": "^6.1.13",
    "pg": "^8.11.3",
    "pg-hstore": "^2.3.4",
    "piscina": "^4.7.0",
    "sanitize": "^2.1.2",
    "figlet": "^1.7.0",
    "jszip": "^3.10.1"
    "sequelize": "^6.37.1",
    "sequelize-cli": "^6.6.2"
  },
  "type": "module"
}
113 server.js

@@ -2,7 +2,6 @@ import getAllFiles from "./lib/dircrawl.js";
import FileHandler from "./lib/filehandler.js";
import Searcher from "./lib/search.js";
import cron from "node-cron";
import FileOlderThan from "file-older-than";
import "dotenv/config";
import express from "express";
import http from "http";
@@ -12,9 +11,9 @@ import compression from "compression";
import { generateAsciiArt } from './lib/asciiart.js';
import { getEmulatorConfig, isEmulatorCompatible, isNonGameContent } from './lib/emulatorConfig.js';
import fetch from 'node-fetch';
import { initDB, File, QueryCount } from './lib/database.js';
import { initElasticsearch } from './lib/services/elasticsearch.js';

let fileListPath = "./data/filelist.json";
let queryCountFile = "./data/queries.txt";
let categoryListPath = "./lib/categories.json"
let searchAlikesPath = './lib/searchalikes.json'
let nonGameTermsPath = './lib/nonGameTerms.json'
@@ -25,12 +24,15 @@ let crawlTime = 0;
let queryCount = 0;
let fileCount = 0;
let indexPage = "pages/index";
if (FileHandler.fileExists(fileListPath)) {
  crawlTime = await FileHandler.fileTime(fileListPath);
}
if (FileHandler.fileExists(queryCountFile)) {
  queryCount = parseInt(await FileHandler.readFile(queryCountFile));
}

// Initialize databases
await initDB();
await initElasticsearch();

// Get initial counts
fileCount = await File.count();
crawlTime = (await File.max('updatedAt'))?.getTime() || 0;
queryCount = (await QueryCount.findOne())?.count || 0;

let searchFields = ["filename", "category", "type", "region"];

@@ -52,29 +54,16 @@ for (let field in searchFields) {
  }
}

let fileList = [];
let search; //cheat so we can check before assignment
let search = new Searcher(searchFields);

async function getFilesJob() {
  console.log("Updating the file list.");
  fileList = await getAllFiles(categoryList);
  if(!fileList){
    if(typeof search == "undefined"){
      //fall back to loading the list if it exists
      await loadFileList()
    }
    return
  fileCount = await getAllFiles(categoryList);
  if(!fileCount) {
    console.log("File update failed");
    return;
  }
  await FileHandler.saveJsonFile(fileListPath, fileList);
  fileCount = fileList.length;
  if (typeof search == "undefined") {
    search = new Searcher(searchFields);
    await search.createIndex(fileList)
  } else {
    await search.updateIndex(fileList);
  }
  fileList = [];
  crawlTime = await FileHandler.fileTime(fileListPath);
  crawlTime = Date.now();
  console.log(`Finished updating file list. ${fileCount} found.`);
}

@@ -82,38 +71,19 @@ function buildOptions(page, options) {
  return { page: page, ...options, ...defaultOptions };
}

async function loadFileList(){
  fileList = await FileHandler.parseJsonFile(fileListPath);
  fileCount = fileList.length;
  search = new Searcher(searchFields, searchAlikes.StringGroups);
  await search.createIndex(fileList)
  fileList = [];
}

if (
  process.env.FORCE_FILE_REBUILD == "1" ||
  !FileHandler.fileExists(fileListPath) ||
  FileOlderThan(fileListPath, "1w")
) {
  await getFilesJob();
} else {
  await loadFileList()
}

let defaultOptions = {
  crawlTime: crawlTime,
  queryCount: queryCount,
  fileCount: fileCount,
  termCount: search.miniSearch.termCount,
  termCount: 0,
  generateAsciiArt: generateAsciiArt,
  isEmulatorCompatible: isEmulatorCompatible
};

function updateDefaults(){
  defaultOptions.crawlTime = crawlTime
  defaultOptions.queryCount = queryCount
  defaultOptions.fileCount = fileCount
  defaultOptions.termCount = search.miniSearch.termCount
  defaultOptions.crawlTime = crawlTime;
  defaultOptions.queryCount = queryCount;
  defaultOptions.fileCount = fileCount;
}

let app = express();
@@ -154,13 +124,16 @@ app.get("/search", async function (req, res) {
    }
  }
  if (settings.combineWith != "AND") {
    delete settings.combineWith; //remove if unset to avoid crashing
    delete settings.combineWith;
  }
  let results = await search.findAllMatches(query, settings);
  debugPrint(results);
  if(results.items.length && pageNum == 1){
    queryCount += 1;
    FileHandler.writeFile(queryCountFile, String(queryCount));
    await QueryCount.update(
      { count: queryCount },
      { where: { id: 1 } }
    );
    updateDefaults()
  }
  let options = {
@@ -173,26 +146,31 @@ app.get("/search", async function (req, res) {
  let page = "results";
  options = buildOptions(page, options);
  res.render(indexPage, options);

});

app.get("/lucky", async function (req, res) {
  let results = [];
  if (req.query.q) {
    let settings = req.query.s ? JSON.parse(req.query.s) : defaultSettings;
    let settings = req.query.s ? JSON.parse(atob(req.query.s)) : defaultSettings;
    results = await search.findAllMatches(req.query.q, settings);
    debugPrint(results);
  }
  if (results.length) {
  if (results.items.length) {
    res.redirect(results.items[0].path);
  } else {
    const magicNum = Math.floor(Math.random() * search.getIndexSize());
    const luckyPath = search.findIndex(magicNum).path;
    debugPrint(`${magicNum}: ${luckyPath}`);
    res.redirect(luckyPath);
    const count = await File.count();
    const randomId = Math.floor(Math.random() * count);
    const luckyFile = await File.findOne({
      offset: randomId
    });
    debugPrint(`${randomId}: ${luckyFile?.path}`);
    res.redirect(luckyFile?.path || '/');
  }
  queryCount += 1;
  FileHandler.writeFile(queryCountFile, String(queryCount));
  await QueryCount.update(
    { count: queryCount },
    { where: { id: 1 } }
  );
  updateDefaults()
});

@@ -229,7 +207,7 @@ app.get("/play/:id", async function (req, res) {
  }

  let fileId = parseInt(req.params.id);
  let romFile = search.findIndex(fileId);
  let romFile = await search.findIndex(fileId);

  if (!romFile) {
    res.redirect('/');
@@ -255,7 +233,7 @@ app.get("/proxy-rom/:id", async function (req, res) {
  }

  let fileId = parseInt(req.params.id);
  let romFile = search.findIndex(fileId);
  let romFile = await search.findIndex(fileId);

  if (!romFile) {
    res.status(404).send('ROM not found');
@@ -322,4 +300,13 @@ server.on("listening", function () {
});
console.log(`Loaded ${fileCount} known files.`);

// Run file update job if needed
if (
  process.env.FORCE_FILE_REBUILD == "1" ||
  !fileCount ||
  (crawlTime && Date.now() - crawlTime > 7 * 24 * 60 * 60 * 1000) // 1 week
) {
  await getFilesJob();
}

cron.schedule("0 30 2 * * *", getFilesJob);
@@ -144,7 +144,7 @@
    </div>
    <script defer>
      resultTable = new DataTable('#results', {
        "order": [[6, 'desc']],
        "order": [[7, 'desc']],
        "columns": [
          { "data": "name" }, // Name
          { "data": "group" }, // Group