hatch_cc-blog_YYYYMMDD_HHMMSS/hatch_sets.json

{
    "ignore-unlisted": true
}

index.js

'use strict';

const Cheerio = require('cheerio');
const {dom, out, props, rule, ruleset, score, type} = require('fathom-web');
const Futils = require('fathom-web/utils');
const JSDOM = require('jsdom/lib/old-api');
const Libingester = require('libingester');
const url = require('url');
const util = require('util');
const {Vimeo} = require('vimeo');
const Youtubedl = require('youtube-dl');

// Replace youtube-dl's callback-style getInfo with a promise-returning
// version so that it can be awaited
Youtubedl.getInfo = util.promisify(Youtubedl.getInfo);

// Feed from which the blog entries to ingest are fetched
const feedURI = 'https://creativecommons.org/blog/feed/';

// Placeholder Vimeo API credentials, handed to the Vimeo client constructor;
// fill in real values before running
const vimeoClientID = '(fill in client ID here)';
const vimeoClientSecret = '(fill in client secret here)';

/**
 * Lazily create a single shared, authenticated Vimeo API client.
 *
 * The in-flight promise is cached rather than the client object, so callers
 * that arrive while the access token is still being requested wait for the
 * same authentication instead of receiving a client that has no token yet.
 *
 * @returns {Promise<Vimeo>} client whose generateClientCredentials() and
 *   request() methods have been promisified and whose access token is set
 */
const ensureVimeoClient = (function () {
    let clientPromise = null;

    // Build the client and obtain an access token for the 'public' scope
    async function createClient() {
        const vimeo = new Vimeo(vimeoClientID, vimeoClientSecret);
        vimeo.generateClientCredentials =
            util.promisify(vimeo.generateClientCredentials.bind(vimeo));
        vimeo.request = util.promisify(vimeo.request.bind(vimeo));

        const {access_token: accessToken} =
            await vimeo.generateClientCredentials(['public']);
        vimeo.setAccessToken(accessToken);
        return vimeo;
    }

    return function ensureVimeoClient() {
        if (!clientPromise) {
            clientPromise = createClient().catch(err => {
                // Forget the failed attempt so that a later call can retry
                clientPromise = null;
                throw err;
            });
        }
        return clientPromise;
    };
})();

/**
 * Request a fixed set of metadata fields for one video from the Vimeo API.
 *
 * @param {string} vimeoID - ID of the video, interpolated into the API path
 * @returns {Promise<Object>} whatever the client's request() resolves to
 */
async function getVideoInfo(vimeoID) {
    const fieldList = [
        'description',
        'license',
        'link',
        'name',
        'modified_time',
        'pictures',
        'privacy',
        'release_time',
        'tags',
    ].join(',');

    const client = await ensureVimeoClient();
    const info = await client.request({
        method: 'GET',
        path: `/videos/${vimeoID}?fields=${fieldList}`
    });
    return info;
}

/**
 * Translate a Vimeo license code into the license string set on the asset.
 *
 * @param {string} license - license code from the Vimeo video info
 * @returns {?string} 'CC BY 3.0' or 'CC BY-NC 3.0' for the two recognized
 *   codes; null (after logging a warning) for anything else
 */
function licenseFromVimeoLicense(license) {
    if (license === 'by')
        return 'CC BY 3.0';
    if (license === 'by-nc')
        return 'CC BY-NC 3.0';

    console.warn(`Unknown license ${license}`);
    return null;
}

/**
 * Reduce a list of tag objects to a list of their names.
 *
 * @param {Array<{name: string}>} tags - tag objects from the Vimeo video info
 * @returns {Array<string>} the `name` property of each tag, in order
 */
function tagsFromVimeoTags(tags) {
    return tags.map(tag => tag.name);
}

/**
 * Score a node by twice its inline text length, and record that same value
 * as the node's `length` note.
 *
 * @param {Object} fnode - Fathom node; only its `element` is read
 * @returns {{score: number, note: {length: number}}}
 */
function scoreByLength(fnode) {
    const doubled = 2 * Futils.inlineTextLength(fnode.element);
    // Penalize empty nodes: a NaN length becomes a score of zero
    const length = Number.isNaN(doubled) ? 0 : doubled;
    return {score: length, note: {length}};
}

/**
 * Score a paragraphish node so that a lower link density gives a higher
 * score.
 *
 * @param {Object} fnode - Fathom node carrying a 'paragraphish' note with a
 *   `length` property
 * @returns {number} 1 when the link density is NaN, otherwise
 *   (1 - density) * 1.5
 */
function byInverseLinkDensity(fnode) {
    const {length: textLength} = fnode.noteFor('paragraphish');
    const density = Futils.linkDensity(fnode, textLength);
    return Number.isNaN(density) ? 1 : 1.5 * (1 - density);
}

/**
 * Score a figure by the pixel area of its first <img>, and note the length
 * of its inline text.
 *
 * @param {Object} fnode - Fathom node; only its `element` is read
 * @returns {{score: number, note: {length: number}}} score is
 *   width * height / 100 when the image has numeric `width` and `height`
 *   attributes, and 100 otherwise; this includes figures that contain no
 *   <img> at all and dimensions such as "100%" that are not plain numbers
 */
function scoreByImageSize(fnode) {
    const DEFAULT_SCORE = 100;

    // A <figure> need not contain an <img>; querySelector returns null then
    const img = fnode.element.querySelector('img');
    // A missing or empty attribute becomes NaN rather than Number('') === 0
    const dimension = name =>
        img ? Number(img.getAttribute(name) || NaN) : NaN;
    const width = dimension('width');
    const height = dimension('height');

    let length = Futils.inlineTextLength(fnode.element) * 2;
    if (Number.isNaN(length))
        length = 1;  // Don't penalize empty captions

    const hasSize = Number.isFinite(width) && Number.isFinite(height);
    return {
        score: hasSize ? width * height / 100 : DEFAULT_SCORE,
        note: {length},
    };
}

/**
 * Build a Fathom scoring callback that rewards nodes lying inside an element
 * with the given tag name.
 *
 * Only true ancestors are examined: the walk starts at the node's parent and
 * continues up to and including the root element, stopping at the first
 * parent that is not an element (such as the document).
 *
 * @param {string} tagName - tag name to look for, compared case-insensitively
 * @param {number} scoreIfHas - score returned when such an ancestor exists
 * @returns {function(Object): number} callback taking an fnode and returning
 *   scoreIfHas or, when no ancestor matches, 1
 */
const hasAncestor = (tagName, scoreIfHas) => {
    const lowerTag = tagName.toLowerCase();
    return fnode => {
        for (let node = fnode.element.parentNode;
            node != null && node.nodeType === node.ELEMENT_NODE;
            node = node.parentNode) {
            if (node.tagName.toLowerCase() === lowerTag)
                return scoreIfHas;
        }
        return 1;
    };
};

// Fathom ruleset that tags candidate nodes as 'paragraphish', scores them,
// and emits the best cluster of them under the output key 'content'.
// NOTE(review): several rules below use 1 as the neutral score and 0 as the
// excluding score, which presumably relies on Fathom combining the scores of
// a node multiplicatively — confirm against the fathom-web version in use.
const rules = ruleset(
    // Isolate the actual blog post body text. Based on Fathom's example
    // Readability rules
    // Text-bearing elements become 'paragraphish'; scoreByLength supplies
    // both the score and the {length} note read by byInverseLinkDensity
    rule(dom('p,li,ol,ul,code,blockquote,pre,h1,h2,h3,h4,h5,h6'),
        props(scoreByLength).type('paragraphish')),
    // Every paragraphish node is additionally scored by its link density
    rule(type('paragraphish'), score(byInverseLinkDensity)),
    // <p> elements get a further fixed score of 4.5
    rule(dom('p'), score(4.5).type('paragraphish')),

    // Tweaks for this particular blog
    // Nodes for which hasAncestor finds an 'article' tag score 10, others 1
    rule(type('paragraphish'), score(hasAncestor('article', 10))),
    // Paragraphs inside the entry summary score 0; ingestArticle drops
    // nodes whose paragraphish score is not greater than 0
    rule(dom('.entry-summary p'), score(0).type('paragraphish')),
    // Figures are scored by scoreByImageSize
    rule(dom('figure'), props(scoreByImageSize).type('paragraphish')),
    // Jetpack video wrappers get a fixed score of 100 and a length note of 1
    rule(dom('.jetpack-video-wrapper'), props(() => ({
        score: 100,
        note: {length: 1},
    })).type('paragraphish')),

    // Find the best cluster of paragraph-ish nodes
    // The cluster is passed through Futils.domSort before being emitted
    // (presumably to restore document order — confirm)
    rule(
        type('paragraphish').bestCluster({
            splittingDistance: 3,
            differentDepthCost: 6.5,
            differentTagCost: 2,
            sameTagCost: 0.5,
            strideCost: 0,
        }),
        out('content').allThrough(Futils.domSort)));

/**
 * Fetch one blog post, extract its body with the Fathom ruleset, and save it
 * to the hatch as a BlogArticle together with its thumbnail, its images, and
 * any embedded Vimeo videos that may be redistributed.
 *
 * @param {Object} hatch - hatch on which save_asset() is called for every
 *   asset produced
 * @param {Object} entry - feed entry describing the post
 * @param {string} entry.title - post title
 * @param {string} entry.link - URL of the post; also its canonical URI
 * @param {*} entry.date - publication date, passed to set_date_published()
 * @param {string} entry.author - post author
 * @returns {Promise<void>} resolves once the post asset has been saved;
 *   rejects if fetching the page or looking up a video fails
 */
async function ingestArticle(hatch, {title, link, date, author}) {
    let $ = await Libingester.util.fetch_html(link);
    const baseURI = Libingester.util.get_doc_base_uri($, link);

    const imageURI = $('meta[property="og:image"]').attr('content');
    const synopsis = $('meta[property="og:description"]').attr('content');
    const lastModified = $('meta[property="article:modified_time"]')
        .attr('content');

    // Wordpress distinguishes predefined "categories" and free-form "tags".
    // We are likely to make Wordpress categories into featured sets, and
    // Wordpress tags non-featured. For now, we will mark the tag IDs of
    // Wordpress tags with "tag:".
    const wpCategory = $('meta[property="article:section"]')
        .attr('content');
    const wpTags = $('meta[property="article:tag"]')
        .map(function () { return $(this).attr('content'); })
        .get();
    const tags = wpTags.map(t => `tag:${t}`);
    // A post without a category must not contribute an `undefined` tag
    if (wpCategory)
        tags.unshift(wpCategory);

    // Fathom needs a real DOM, so hand the page over to jsdom for extraction
    const dom = JSDOM.jsdom($.html(), {
        features: {ProcessExternalResources: false},
    });
    const facts = rules.against(dom);
    const html = facts.get('content')
        .filter(fnode => fnode.scoreFor('paragraphish') > 0)
        .map(fnode => fnode.element.outerHTML).join('');

    // Load the DOM back into Cheerio
    $ = Cheerio.load('<article>');
    $('article').append(html);

    const postAsset = new Libingester.BlogArticle();
    postAsset.set_title(title);
    postAsset.set_synopsis(synopsis);
    postAsset.set_canonical_uri(link);
    if (lastModified)
        postAsset.set_last_modified_date(lastModified);
    postAsset.set_date_published(date);
    postAsset.set_license('CC BY 4.0 International');
    postAsset.set_author(author);
    postAsset.set_read_more_text(`"${title}" by ${author}, used under CC BY 4.0 International / Reformatted from original`);
    postAsset.set_tags(tags);
    postAsset.set_custom_scss(`
        $title-font: 'Source Sans Variable';
        $body-font: 'Source Sans Variable';
        $context-font: 'Source Sans Variable';
        $support-font: 'Source Sans Variable';
        $primary-light-color: #fb7928;
        $primary-medium-color: #ee5b32;

        $accent-light-color: #049bce;
        $accent-dark-color: #464646;

        $background-light-color: white;
        $background-dark-color: #e9e9e9;
        @import '_default';
    `);

    // Not every post declares an og:image; only download one that exists
    if (imageURI) {
        const thumbnailAsset = Libingester.util.download_image(imageURI);
        hatch.save_asset(thumbnailAsset);
        postAsset.set_thumbnail(thumbnailAsset);
    }

    // Replace bare <img>s with <figure>s
    $('p img').each(function () {
        const figure = $('<figure></figure>');
        const enclosingPara = $(this).parents('p')[0];
        figure.append($(this));
        figure.insertBefore(enclosingPara);
    });

    // Pick out a "main image": the first <figure> that actually holds an
    // <img>, so that an image-less figure is never handed to download_img
    const imageFigures = $('figure').filter(function () {
        return $('img', this).length > 0;
    });
    if (imageFigures.length) {
        const main = imageFigures.first();
        const img = $('img', main);
        const mainImageAsset = Libingester.util.download_img(img, baseURI);
        hatch.save_asset(mainImageAsset);

        postAsset.set_main_image(mainImageAsset);
        postAsset.set_main_image_caption($('figcaption', main).text());

        $(main).remove();
    }

    // Save assets for any remaining <figure>s that contain an image
    $('figure').each(function () {
        const img = $('img', this);
        if (!img.length)
            return;
        const figureAsset = Libingester.util.download_img(img, baseURI);
        hatch.save_asset(figureAsset);
    });

    // Identify embedded videos, put them in a <figure>, and mark them for
    // downloading. This must happen before the class attributes are
    // stripped below, since the selectors rely on them.
    const videosToProcess = [];
    $('.jetpack-video-wrapper').each(function () {
        const iframe = $('.embed-vimeo iframe', this).first();
        // A Cheerio selection is always truthy, so test its length instead
        if (iframe.length) {
            const figure = $('<figure></figure>');
            figure.append(iframe);
            // Keep insertAfter's return value: that is the selection which
            // is attached to the document
            videosToProcess.push(figure.insertAfter(this));
        }
        $(this).remove();
    });

    // Do some extra cleanup to minimize the size
    const all = $('*');
    all.removeAttr('class');
    all.removeAttr('style');
    const imgs = $('img');
    ['attachment-id', 'comments-opened', 'image-description', 'image-meta',
        'image-title', 'large-file', 'medium-file', 'orig-file',
        'orig-size', 'permalink']
        .forEach(data => imgs.removeAttr(`data-${data}`));
    imgs.removeAttr('srcset');  // For simplicity, only use one size
    imgs.removeAttr('sizes');

    await Promise.all(videosToProcess.map(async figure => {
        const iframe = figure.find('iframe');
        const src = iframe.attr('src');
        const {host, pathname} = src ? url.parse(src) : {};
        if (host !== 'player.vimeo.com') {
            // The figure exists only to hold this iframe, so drop it whole
            // rather than leaving an empty <figure> in the article
            figure.remove();
            return;
        }

        // pathname looks like /video/<id>
        const [,, vimeoID] = pathname.split('/');
        const {
            description, license, link, name, pictures, privacy, tags,
            release_time: releaseTime,
            modified_time: modifiedTime,
        } = await getVideoInfo(vimeoID);

        // Only download if Vimeo says it is allowed and the license allows
        // redistribution
        const freeLicense = licenseFromVimeoLicense(license);
        if (!privacy.download || !freeLicense) {
            figure.remove();
            return;
        }

        // Try to get the smallest file size in a free codec
        const {url: downloadURL} = await Youtubedl.getInfo(link, [
            '--prefer-free-formats',
            '--format=worst',
        ], {
            maxBuffer: 500 * 1024,  // JSON info is big!
        });
        const video = Libingester.util.get_embedded_video_asset(iframe,
            downloadURL);
        video.set_title(name);
        video.set_synopsis(description);
        video.set_canonical_uri(link);
        video.set_last_modified_date(modifiedTime);
        video.set_date_published(releaseTime);
        video.set_license(freeLicense);
        video.set_tags(tagsFromVimeoTags(tags));

        // The last entry of pictures.sizes is used as the poster frame
        const posterFrame = pictures.sizes.pop();
        const poster = Libingester.util.download_image(posterFrame.link);
        video.set_thumbnail(poster);

        hatch.save_asset(video);
        hatch.save_asset(poster);
    }));

    // Clean up useless <span>s with no attributes
    $('span').filter(function () {
        return Object.keys(this.attribs).length === 0;
    }).each(function () {
        $(this).replaceWith($(this).html());
    });

    postAsset.set_body($);
    postAsset.render();

    hatch.save_asset(postAsset);
}

/**
 * Ingest the entries of the Creative Commons blog feed into a new 'cc-blog'
 * hatch, processing all articles in parallel.
 *
 * @returns {Promise<void>} rejects as soon as any article fails to ingest
 */
async function main() {
    const hatch = new Libingester.Hatch('cc-blog', 'en');
    const paginator = Libingester.util.create_wordpress_paginator(feedURI);
    // NOTE(review): Infinity and 90 are presumably the maximum number of
    // entries and their maximum age in days — confirm against libingester
    const items = await Libingester.util.fetch_rss_entries(paginator,
        Infinity, 90);
    await Promise.all(items.map(entry => ingestArticle(hatch, entry)));
    // Awaiting is harmless if finish() is synchronous, and surfaces its
    // failure here if it returns a promise
    await hatch.finish();
}

// Report failures and exit non-zero instead of leaving the promise floating
main().catch(err => {
    console.error(err);
    process.exitCode = 1;
});

package.json

{
  "name": "cc-ingester",
  "version": "0.0.0",
  "description": "Ingester for Creative Commons blog",
  "main": "index.js",
  "dependencies": {
    "cheerio": "^0.22.0",
    "fathom-web": "^2.2.0",
    "jsdom": "^11.6.2",
    "libingester": "^2.5.6",
    "vimeo": "^2.0.1",
    "youtube-dl": "^1.12.2"
  },
  "devDependencies": {
    "eslint": "^4.16.0"
  },
  "scripts": {
    "start": "node index.js",
    "test": "eslint ."
  },
  "author": "Philip Chimento",
  "license": "CC0-1.0"
}

sets.json

{
    "sets": [
        {
            "title": "About Creative Commons",
            "featured": true,
            "tags": ["EknSetObject"],
            "childTags": [
                "About CC"
            ]
        },
        {
            "title": "Around the World",
            "featured": true,
            "tags": ["EknSetObject"],
            "childTags": [
                "Global affiliates",
                "tag:Canada",
                "tag:Creative Commons Global Network",
                "tag:EU",
                "tag:EU-Mercosur",
                "tag:European Union",
                "tag:Ghana",
                "tag:Mercosur",
                "tag:Mexico",
                "tag:Portuguese",
                "tag:Spanish",
                "tag:Toronto",
                "tag:africa",
                "tag:chinese"
            ]
        },
        {
            "title": "Events",
            "featured": true,
            "tags": ["EknSetObject"],
            "childTags": [
                "Events"
            ]
        },
        {
            "title": "Interviews",
            "featured": true,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:cctalkswith"
            ]
        },
        {
            "title": "Policy, Advocacy, & Copyright Reform",
            "featured": true,
            "tags": ["EknSetObject"],
            "childTags": [
                "Policy / advocacy / copyright reform"
            ]
        },
        {
            "title": "Blog Posts",
            "featured": true,
            "tags": ["EknSetObject"],
            "childTags": [
                "Arts / culture",
                "Education / OER",
                "Global affiliates",
                "Legal tools / licenses",
                "Open access",
                "Weblog"
            ]
        },
        {
            "title": "Advocacy",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:advocacy"
            ]
        },
        {
            "title": "Art",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:art"
            ]
        },
        {
            "title": "Arts & Culture",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "Arts / culture"
            ]
        },
        {
            "title": "CC Licenses",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "Legal tools / licenses"
            ]
        },
        {
            "title": "Copyright Reform",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:#fixcopyright",
                "tag:copyright reform",
                "tag:eu copyright reform"
            ]
        },
        {
            "title": "Copyright Week",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:#copyrightweek",
                "tag:#coyrightweek",
                "tag:Copyright Week 2018"
            ]
        },
        {
            "title": "Education & OER",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "Education / OER"
            ]
        },
        {
            "title": "Emerging Media",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:AR",
                "tag:VR",
                "tag:augmented reality",
                "tag:emerging media",
                "tag:immersive media",
                "tag:virtual reality"
            ]
        },
        {
            "title": "Global Summit",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:CC Global Summit",
                "tag:CC summit",
                "tag:Global Summit",
                "tag:ccglobalsummit",
                "tag:ccsummit",
                "tag:ccsummit18",
                "tag:global summit 2018"
            ]
        },
        {
            "title": "Humans of the Commons",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:humans of the commons",
                "tag:humansofthecommons"
            ]
        },
        {
            "title": "Net Neutrality",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:#netneutrality",
                "tag:net neutrality",
                "tag:FCC"
            ]
        },
        {
            "title": "Open Access",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "Open access"
            ]
        },
        {
            "title": "Policy",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:policy",
                "tag:public policy"
            ]
        },
        {
            "title": "Public Domain",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:public domain",
                "tag:CC0"
            ]
        },
        {
            "title": "Trade Agreements",
            "featured": false,
            "tags": ["EknSetObject"],
            "childTags": [
                "tag:free trade agreement",
                "tag:trade agreements"
            ]
        }
    ]
}

.eslintrc.json

{
    "env": {
        "es6": true,
        "node": true
    },
    "parserOptions": {
        "ecmaVersion": 2017
    },
    "extends": "eslint:recommended",
    "rules": {
        "no-console": 0,
        "indent": [
            "error",
            4,
            {
                "MemberExpression": "off"
            }
        ],
        "linebreak-style": [
            "error",
            "unix"
        ],
        "quotes": [
            "error",
            "single"
        ],
        "semi": [
            "error",
            "always"
        ]
    }
}

The results of the search are