Node.js使用cheerio解析html-阿里云开发者社区

Node.js使用cheerio解析html

2023-02-04 115

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介： Node.js使用cheerio解析html

cheerio语法类似jQuery

doc

doc-zh: https://github.com/cheeriojs/cheerio/wiki/Chinese-README

安装

npm i cheerio

代码实例

const cheerio = require("cheerio");

// Parse the fragment in XML mode and leave entities undecoded.
const parseOptions = {
  xmlMode: true,
  decodeEntities: false
};
const doc = cheerio.load('<h2 class="title">Hello world</h2>', parseOptions);

// Replace the heading text, then add an extra class to every <h2>.
const heading = doc("h2.title");
heading.text("Hello there!");
doc("h2").addClass("welcome");

const serialized = doc.xml();
console.log(serialized);
// <h2 class="title welcome">Hello there!</h2>

项目实战

import cheerio from "cheerio";

/**
 * Rewrites externally hosted image URLs so they point at this site.
 *
 * Every `<img>` whose `src` does not contain `process.env.VUE_APP_BASE_URL`
 * is handed to `saveImage`; when that resolves to a truthy value it becomes
 * the element's new `src`. Images are handled one at a time, in match order.
 *
 * NOTE(review): `saveImage` is not defined in this snippet — supply your own
 * implementation that takes the original URL and resolves to the new one.
 *
 * @param {string} html - Markup handed to `cheerio.load` (expected to be a markup string).
 * @returns {Promise<string>} The document serialized with `doc.xml()`.
 */
export async function replaceImage(html) {
  const doc = cheerio.load(html, {
    xmlMode: true,
    decodeEntities: false
  });

  // `.each()` does not wait for promises, so snapshot the matches first and
  // await inside a plain loop instead.
  const images = doc("img")
    .toArray()
    .map((node) => doc(node));

  for (const image of images) {
    const src = image.attr("src");

    // Skip images without a src and those whose src already contains the base URL.
    if (!src || src.includes(process.env.VUE_APP_BASE_URL)) {
      continue;
    }

    // Swap in your own upload/replace routine here.
    const imageSrc = await saveImage(src);

    if (imageSrc) {
      image.attr("src", imageSrc);
    }
  }

  return doc.xml();
}

/**
 * Collects the `src` attribute of every `<img>` found in the markup.
 *
 * @param {string} html - Markup handed to `cheerio.load` (expected to be a markup string).
 * @returns {string[]} The truthy `src` values, in match order.
 */
export function extractImages(html) {
  const parseOptions = {
    xmlMode: true,
    decodeEntities: false
  };
  const $ = cheerio.load(html, parseOptions);
  const sources = [];

  $("img").each((_position, node) => {
    const source = $(node).attr("src");
    // Images with a missing or empty src are left out.
    if (!source) {
      return;
    }
    sources.push(source);
  });

  return sources;
}



/**
 * Strips the inline `style` attribute from every element that carries one.
 *
 * @param {string} html - Markup handed to `cheerio.load` (expected to be a markup string).
 * @returns {string} The document serialized with `.xml()` after the attributes are removed.
 */
export function removeStyle(html) {
  const parseOptions = {
    xmlMode: true,
    decodeEntities: false
  };
  const $ = cheerio.load(html, parseOptions);

  // Select anything that has a style attribute, whatever the tag.
  const styled = $("*[style]");
  styled.removeAttr("style");

  return $.xml();
}

xml和html

const cheerio = require("cheerio");

const doc = cheerio.load("<a></a>");

// XML serialization: the empty <a> element is emitted as a self-closing tag.
const asXml = doc.xml();
console.log(asXml);
// <html><head/><body><a/></body></html>

// HTML serialization: the <a> element keeps its explicit closing tag.
const asHtml = doc.html();
console.log(asHtml);
// <html><head></head><body><a></a></body></html>

如果只需要处理 HTML 片段，可以对 doc.html() 返回的结果自行处理，去掉 cheerio 自动补上的 html、head、body 标签：

const cheerio = require("cheerio");

/**
 * Loads markup into cheerio using its default options.
 *
 * @param {string} html - Markup handed to `cheerio.load` (expected to be a markup string).
 * @returns The value returned by `cheerio.load`.
 */
function getDom(html) {
  const loaded = cheerio.load(html);
  return loaded;
}

/**
 * Serializes a loaded document and strips the `<html><head></head><body>`
 * wrapper so that only the inner fragment remains.
 *
 * @param {{ html: () => string }} doc - Object exposing an `html()` serializer.
 * @returns {string} The markup between `<body>` and `</body>`, or the full
 *   serialized markup unchanged when the wrapper is not present.
 */
function toHtml(doc) {
  const html = doc.html();
  // The slashes of the closing tags must be escaped; an unescaped `/` ends the
  // regex literal early and makes the whole file a syntax error.
  const wrapper = /<html><head><\/head><body>([\s\S]*)<\/body><\/html>/;
  const match = html.match(wrapper);
  // Without the wrapper there is nothing to strip, so return the markup as is
  // instead of dereferencing a null match.
  return match ? match[1] : html;
}

// Load a bare fragment, strip the generated wrapper tags, and print the result.
const fragmentDoc = getDom("<a></a>");
const fragmentHtml = toHtml(fragmentDoc);
console.log(fragmentHtml);
// <a></a>

            </div>

Node.js使用cheerio解析html

xml和html

热门文章

最新文章

相关课程

相关电子书

推荐镜像

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

Node.js使用cheerio解析html

xml和html

热门文章

最新文章

相关课程

相关电子书

推荐镜像