Get all public packages in the npm registry

Thread starter: Martin (Guest)
Code:
import { appendFileSync, writeFileSync } from 'fs';

const baseUrl = 'https://replicate.npmjs.com/_all_docs';
const outputFile = 'npm_packages_metadata_7.json';
const pageSize = 1000;
const maxRetries = 3;
const retryDelay = 5000; // Delay in milliseconds (5 seconds)

let hasMore = true;
let lastDocId = '';
let step = 0; // Page counter used to label the per-page timers

function delay(ms: number) {
    return new Promise(resolve => setTimeout(resolve, ms));
}

async function fetchPage(startkey: string | null, outputFile: string, retryCount = 0) {
    try {
        console.time(`Step ${step + 1} time`);
        const queryParams = new URLSearchParams({
            include_docs: 'true',
            limit: pageSize.toString(),
        });
        if (startkey) {
            queryParams.set('startkey', JSON.stringify(startkey));
            queryParams.set('skip', '1'); // Skip the first item to avoid duplication
        }

        const url = `${baseUrl}?${queryParams.toString()}`;
        console.log(`[${new Date().toISOString()}] Fetching: ${url}`);

        const response = await fetch(url);
        console.log(`[${new Date().toISOString()}] Fetched response status: ${response.status}, statusText: ${response.statusText}, content-length: ${response.headers.get('content-length')}, ok: ${response.ok}, redirected: ${response.redirected}`);
        const data: any = await response.json();
        if (data.rows.length === pageSize) {
            hasMore = true;
        } else {
            hasMore = false;
        }

        console.log(`[${new Date().toISOString()}] Fetched ${data.rows.length} rows. HasMore: ${hasMore}`);

        // Process and save the data
        const docs = data.rows.map((row: any) => row.doc);
        if (docs.length > 0) {
            docs.forEach((doc: any, index: number) => {
                const jsonString = JSON.stringify(doc, null, 2);
                appendFileSync(outputFile, jsonString + (hasMore || index < docs.length - 1 ? ',\n' : '\n'));
            });
            // Update the last document ID
            lastDocId = docs[docs.length - 1]._id;
        }

        step = step + 1;
        console.timeEnd(`Step ${step} time`);

        // Check if there's more data to fetch
        if (hasMore) {
            lastDocId = data.rows[data.rows.length - 1].id;
            console.log(`[${new Date().toISOString()}] Last key: ${lastDocId}`);

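            // Note: each page awaits the next recursively, so the chain of
            // pending calls grows by one frame per page fetched.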
            await fetchPage(lastDocId, outputFile);
        }
    } catch (error) {
        console.log(`[${new Date().toISOString()}] An error occurred while fetching the data:`, error);
        if (retryCount < maxRetries) {
            console.log(`[${new Date().toISOString()}] Error details:`, error);
            console.log(`[${new Date().toISOString()}] Retrying (${retryCount + 1}/${maxRetries}) after ${retryDelay}ms...`);
            await delay(retryDelay);
            await fetchPage(startkey, outputFile, retryCount + 1);
        } else {
            console.log(`[${new Date().toISOString()}] Max retries reached. Skipping this page.`);
        }
    }

}

async function fetchAndSaveNpmDocs() {
    console.time("Total time");
    writeFileSync(outputFile, '[\n');  // Start the JSON array
    try {
        await fetchPage(null, outputFile);
    } catch (error) {
        console.log('An error occurred while fetching the data:', error);
    } finally {
        appendFileSync(outputFile, '{}]\n');  // Add an empty object to handle the trailing comma
        console.log('All package metadata has been saved to', outputFile);
        console.timeEnd("Total time");
    }
}

export {
    fetchAndSaveNpmDocs
}

For research purposes, I'd like to get all the packages that are available on npm. How can I do this? I have tried several scripts; with this one I managed to download around 9 GB of data, but it is not stable and gets slower over time. Is there a more efficient way to do it?

I even saw in this paper (https://arxiv.org/pdf/2112.10165) that the authors took a snapshot of the npm registry, but they didn't provide details. I contacted them, but after a month I still have no response.
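For comparison, here is a minimal sketch of one variation on the same _all_docs pagination, assuming Node 18+ with the built-in fetch: a while loop instead of recursion (so the chain of pending calls stays flat), newline-delimited JSON (NDJSON) output instead of one large JSON array, and a small state file so an interrupted crawl can resume from the last saved key. The file names and the resume logic are illustrative assumptions, and the retry/backoff handling from the original script is omitted for brevity.

Code:
import { appendFileSync, existsSync, readFileSync, writeFileSync } from 'fs';

const baseUrl = 'https://replicate.npmjs.com/_all_docs';
const outputFile = 'npm_packages.ndjson'; // hypothetical: one JSON doc per line
const stateFile = 'npm_crawl_state.txt';  // hypothetical: last key fetched
const pageSize = 1000;

async function crawlAllDocs() {
    // Resume from the last saved key if a previous run left one behind.
    let startkey: string | null = existsSync(stateFile)
        ? readFileSync(stateFile, 'utf8').trim() || null
        : null;

    while (true) {
        const params = new URLSearchParams({
            include_docs: 'true',
            limit: pageSize.toString(),
        });
        if (startkey) {
            params.set('startkey', JSON.stringify(startkey));
            params.set('skip', '1'); // skip the startkey doc itself
        }

        const response = await fetch(`${baseUrl}?${params}`);
        if (!response.ok) throw new Error(`HTTP ${response.status} ${response.statusText}`);
        const data: any = await response.json();
        if (data.rows.length === 0) break;

        // NDJSON: every line is a complete document, so a partially written
        // file is still usable and the crawl can restart from the state file.
        const chunk = data.rows.map((row: any) => JSON.stringify(row.doc)).join('\n') + '\n';
        appendFileSync(outputFile, chunk);

        startkey = data.rows[data.rows.length - 1].id;
        writeFileSync(stateFile, startkey ?? '');
        console.log(`[${new Date().toISOString()}] Saved ${data.rows.length} docs, up to ${startkey}`);

        if (data.rows.length < pageSize) break; // short page: no more data
    }
}

export { crawlAllDocs };

Since the public replica is a CouchDB instance, its _changes feed (GET /_changes?include_docs=true&since=0, persisting the returned seq between runs) is another commonly suggested way to enumerate every package and then keep the snapshot current; whether replicate.npmjs.com still permits full replication under its current rate limits is worth verifying before starting a long crawl.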

 
