Skip to content

feat: rate limiting #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/nodejs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

strategy:
matrix:
node-version: [10.x, 12.x]
node-version: [12.x, 14.x, 16.x]

steps:
- uses: actions/checkout@v2
Expand Down
29 changes: 29 additions & 0 deletions fetch.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
const agent = require('superagent')
const debug = require('debug')('linkchecker:fetch')

// Delay before the first retry; passed to sleep(), i.e. milliseconds.
// The delay is doubled after every backoff.
const BACKOFF_START = 200
// Bounds how often a rate-limited request is retried (see the retry loop in fetch).
const MAX_BACKOFF_COUNT = 5
// HTTP status compared against the error's `status` to detect rate limiting
// (429 = Too Many Requests).
const RATE_LIMIT_CODE = 429

/**
 * Pause helper for async code.
 *
 * @param {number} ms - delay handed to setTimeout, in milliseconds
 * @returns {Promise<undefined>} promise that resolves once the timer fires
 */
function sleep (ms) {
  return new Promise(function (wake) {
    setTimeout(wake, ms)
  })
}

module.exports = async function fetch (url, method, options) {
debug('fetching', url)

let backoffInterval = BACKOFF_START
for (let backoffTry = 1; backoffTry <= MAX_BACKOFF_COUNT + 1; backoffTry++) {
try {
return await agent[method](url).timeout({ response: options['http-timeout'] }).redirects(options['http-redirects'])
} catch (e){
if (e.status !== RATE_LIMIT_CODE || backoffTry === MAX_BACKOFF_COUNT) {
throw e
}
}

debug('backing off try no. ', backoffTry, ' for url ', url)
await sleep(backoffInterval)
backoffInterval = 2*backoffInterval
}
}
110 changes: 60 additions & 50 deletions link-checker.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ const path = require('path')
const debug = require('debug')('linkchecker')
const cheerio = require('cheerio')
const urlencode = require('urlencode')
const agent = require('superagent')
const mkdirp = require('mkdirp')
const ms = require('ms')
const walker = require('./walker')
const javadoc = require('./javadoc')
const fetch = require('./fetch')

var rcConfig = require('rc')('linkchecker', {});

Expand Down Expand Up @@ -39,10 +39,10 @@ module.exports = function(directory, options = {}, callback) {
options.overrides.set(new RegExp(pattern), opts)
options.overrides.delete(pattern)
})

const localLinks = new Map() // links to other local files, without an anchor
const localAnchorLinks = new Map() // links to other local files with an anchor
const localParentLinks = new Map() //
const localParentLinks = new Map() //
const localParentAnchorLinks = new Map()
const remoteLinks = new Map() // links to remote files, http(s), without an anchor)
const remoteAnchorLinks = new Map() // links to remote files, http(s) with an anchor
Expand All @@ -53,10 +53,10 @@ module.exports = function(directory, options = {}, callback) {

let fileCounter = 0
debug('scanning directory', directory)

function getOverrideFor(target) {
const url = target instanceof Error && target.response ? target.response.request.url // superagent error
: typeof target === "object" ? target.request.url // superagent response
: typeof target === "object" && target.request ? target.request.url // superagent response
: typeof target === "string" ? target // plain URL
: null

Expand All @@ -74,25 +74,26 @@ module.exports = function(directory, options = {}, callback) {
}
fileCounter += 1
localPages.add(filePath)
debug('scanning file', filePath)

const $ = cheerio.load(fileContent)

const links = $('body').find('a')
links.each(function(i, element) {
links.each(function() {
const $this = $(this)
let href = ($this.attr('href') || '').trim()
const linkSpecificOptions = getOverrideFor(href)
if (href.indexOf('mailto:') == 0) {

if (href.startsWith('mailto:')) {
return
}

if (href == '#' && linkSpecificOptions['allow-hash-href']) {
if (href === '#' && linkSpecificOptions['allow-hash-href']) {
debug('ignore hash href on', filePath)
return
}

if (href == null || href == '') {
if (href == null || href === '') {
debug('ignore invalid href "' + href + '" on', filePath)
return
}
Expand All @@ -105,20 +106,20 @@ module.exports = function(directory, options = {}, callback) {
debug('ignoring file', filePath)
return
}
}
}

if (href == '.') {
if (href === '.') {
debug('ignore link to itself via . from', filePath)
return
}
if (href.indexOf('javascript:') != -1) {
if (href.includes('javascript:')) {
debug('ignore javascript href: ' + href, filePath)
return
}

if (linkSpecificOptions['url-swap'] && linkSpecificOptions['url-swap'].length > 0) {
const found = linkSpecificOptions['url-swap'].forEach(line => {
// DO NOT use split(':') because it might be replaced with http://
// DO NOT use split(':') because it might be replaced with http://
const indexOfColon = line.indexOf(':')
const pattern = new RegExp(line.substr(0, indexOfColon))
const replacement = line.substr(indexOfColon + 1)
Expand All @@ -128,7 +129,7 @@ module.exports = function(directory, options = {}, callback) {
debug('replaced', href)
}
})
}
}
if (linkSpecificOptions['url-ignore'] && linkSpecificOptions['url-ignore'].length > 0) {
const found = linkSpecificOptions['url-ignore'].some(ignore => {
return href.match(ignore) != null
Expand All @@ -139,11 +140,11 @@ module.exports = function(directory, options = {}, callback) {
}
}

if (href.indexOf('http://') != 0 && href.indexOf('https://') != 0) {
if (!href.startsWith('http://') && !href.startsWith('https://')) { // Local link
if (options['external-only']) {
return
}
if (href.split('').pop() == '/') {
if (href.endsWith('/')) {
debug('append index.html to ' + href, filePath)
href = href + 'index.html'
} else if (href.substr(href.length - 2) == '..') {
Expand All @@ -153,14 +154,14 @@ module.exports = function(directory, options = {}, callback) {
debug('add index.html between / and # ' + href, filePath)
href = href.substr(0, href.indexOf('#')) + 'index.html' + href.substr(href.indexOf('#'))
}
} else {
} else { // Remote link
if (options['disable-external']) {
debug('ignore remote link' + href, filePath)
return
}

}

if (options.javadoc || (options['javadoc-external'] && options['javadoc-external'].length > 0)) {
href = javadoc(href, options.javadoc, options['javadoc-external'])
// some links have a special href attribute (<a xlink:href="...">)
Expand Down Expand Up @@ -188,15 +189,24 @@ module.exports = function(directory, options = {}, callback) {
href = url + '#' + urlencode.decode(anchor)
}

const resolvedHref = path.join(path.dirname(filePath), href)
let resolvedHref
if (href.startsWith('/')) {
// absolute paths are expected to be fully resolved to the root directory
// so only remove the leading slash
resolvedHref = href.substr(1)
} else {
resolvedHref = path.join(path.dirname(filePath), href)
}


debug('text content for ' + resolvedHref, $this.html())
if (href.indexOf('http://') == 0 || href.indexOf('https://') == 0) {
if (href.indexOf('#') == -1) {
if (href.startsWith('http://') || href.startsWith('https://')) {
if (!href.includes('#')) {
remoteLinks.set(href, filePath)
} else {
remoteAnchorLinks.set(href, filePath)
}
} else if (resolvedHref.indexOf('..') == 0) {
} else if (resolvedHref.startsWith('..')) {
// non http(s) links
if (options['limit-scope']) {
// TODO: same error will reported multiple times, consider to do the check and creating errors in the callback/
Expand All @@ -207,23 +217,20 @@ module.exports = function(directory, options = {}, callback) {
reason: 'target is out of scope'
})
} else {
if (href.indexOf('#') == -1) {
localParentLinks.set(resolvedHref, filePath)
} else {
if (href.startsWith('#')) {
localParentAnchorLinks.set(resolvedHref, filePath)
} else {
localParentLinks.set(resolvedHref, filePath)
}
}
} else if (href.indexOf('#') != -1) {
} else if (href.includes('#')) {
const resolvedAnchorHref = (href.indexOf('#') == 0 ? filePath + href : resolvedHref)
debug('adding localAnchorLink on page ' + filePath, resolvedAnchorHref)
localAnchorLinks.set(resolvedAnchorHref, filePath) // consider to use a set as value
} else {
debug('adding localLink on page ' + filePath, resolvedHref)
localLinks.set(resolvedHref, filePath) // consider to use a set as value
}



})

const anchors = $('html').find('[id], [name]')
Expand All @@ -243,7 +250,7 @@ module.exports = function(directory, options = {}, callback) {
})
}
})

}, async function() {
debug('localPages', localPages)
debug('remotePages', remoteLinks)
Expand All @@ -265,7 +272,7 @@ module.exports = function(directory, options = {}, callback) {

localAnchorLinks.forEach((sourcePage, link) => {
debug('lookup for', link)
const anchorCharIndex = link.indexOf('#')
const anchorCharIndex = link.indexOf('#')
const page = link.substr(0, anchorCharIndex)
const anchor = link.substr(anchorCharIndex + 1)
const resolvedPage = page === '' ? sourcePage : page
Expand Down Expand Up @@ -342,15 +349,16 @@ module.exports = function(directory, options = {}, callback) {

const remoteLinksArray = Array.from(remoteLinks.keys())
await Promise.all(remoteLinksArray.map(target => {
const linkSpecificOptions = getOverrideFor(target)
const linkSpecificOptions = getOverrideFor(target)
let method = 'head'
if (linkSpecificOptions['http-always-get']) {
method = 'get'
}
if (cache && cache[target] && cache[target].created + expiration > Date.now()) {
return new Promise(resolve => resolve(Object.assign({}, cache[target].payload, {cached: true})))
}
return agent[method](target).timeout({response: linkSpecificOptions['http-timeout']}).redirects(linkSpecificOptions['http-redirects'])

return fetch(target, method, linkSpecificOptions)
})
// map rejected to resolved promises
.map(p => p.catch(error => error)))
Expand Down Expand Up @@ -400,34 +408,36 @@ module.exports = function(directory, options = {}, callback) {

const remoteAnchorLinksArray = Array.from(remoteAnchorLinks.keys())
await Promise.all(remoteAnchorLinksArray.map(target => {
const linkSpecificOptions = getOverrideFor(target)
const linkSpecificOptions = getOverrideFor(target)
if (cache && cache[target] && cache[target].created + expiration > Date.now()) {
return new Promise(resolve => resolve(Object.assign({}, cache[target].payload, {cached: true})))
}
return agent.get(target).timeout({response: linkSpecificOptions['http-timeout']}).redirects(linkSpecificOptions['http-redirects'])
}

return fetch(target, 'get', linkSpecificOptions)
})
// map rejected to resolved promises
.map(p => p.catch(error => error)))
.then(responses => {
responses.forEach((response, index) => {
if(!response.request && !response.response.request) {
console.error(response)
}
const linkSpecificOptions = getOverrideFor(response)
// if (!response.request && !(response.response && response.response.request)) {
// console.error(response)
// }
const linkSpecificOptions = getOverrideFor(response)
const target = remoteAnchorLinksArray[index]
const source = remoteAnchorLinks.get(target)

if (response && response.statusCode && response.statusCode >= 200 && response.statusCode < 300) {
if (cache && !response.cached) {
cache[target] = {
payload: {
statusCode: response.statusCode,
text: response.text
},
created: Date.now()
}
}
if(!linkSpecificOptions['allow-hash-ref']) return
payload: {
statusCode: response.statusCode,
text: response.text
},
created: Date.now()
}
}

if(!linkSpecificOptions['allow-hash-ref']) return
const anchor = target.split('#')[1]
const $ = cheerio.load(response.text)
const anchors = $('body').find(`[id='${anchor}'], [name='${anchor}']`)
Expand Down
Loading