简单解析XML和HTML_问答-阿里云开发者社区

    网页本质上可以看作一种格式相对固定的XML，因此可以按XML的结构来解析，而不一定要用正则表达式——正则既不好记，又更适合处理纯文本。我们可以把录制到的控件当成一小段XML来处理，通常只需要取它的TEXT或某个属性值。下面是一个简版的XML（或者说HTML）解析方法。
    先来看下是怎么用的
; Demo driver: exercises each parsing helper against a small HTML sample and
; shows the results in message boxes.
; Result arrays returned by the es* helpers are iterated from index 1 to
; UBound - 1, so the number of elements is UBound - 1.
Func main()
    $html = "<html><head><meta charset='utf-8'   class='hello world'/><title id='133' class=1><title class=1 />hello<title class='ha'>world</title></title><title class='ha'>This is the second title</title></head></html>"

    ; HTML of the first element with the given tag (only the first one is
    ; returned when several exist).
    $titleHtml = eGetHtmlByTag($html, "title")
    J_MsgBox("首个title标签的html代码为:" & $titleHtml)

    ; Value of one attribute of that element.
    $titleAttr = eGetAttr($titleHtml, "id")
    J_MsgBox("首个title标签的id属性值为:" & $titleAttr)

    ; Visible text content of the first <head> element.
    $headText = eGetTextByTag($html, "head")
    J_MsgBox("首个head标签的文本内容为:" & $headText)

    ; Every top-level <title> element.
    $titleHtmls = esGetHtmlByTag($html, "title")
    ; Elements occupy indexes 1..UBound-1, so the count is UBound - 1
    ; (the original reported UBound, which is one too many).
    J_MsgBox("当前层的title标签共有:" & (UBound($titleHtmls) - 1))
    For $i = 1 To UBound($titleHtmls) - 1
        J_MsgBox("第" & $i & "个title标签为:" & $titleHtmls[$i])
        ; <title> elements nested directly inside the current one.
        $subTitleHtmls = esGetHtmlByTag(eGetInnerHtml($titleHtmls[$i]), "title")
        For $j = 1 To UBound($subTitleHtmls) - 1
            J_MsgBox("内层第" & $j & "个标签:" & $subTitleHtmls[$j])
        Next
    Next

    ; Every <title> element (at any depth) whose class attribute is "ha".
    $titleHtmlDemos = esGetHtmlByTagAndAttr($html, "title", "class", "ha")
    For $i = 1 To UBound($titleHtmlDemos) - 1
        J_MsgBox("class为ha的第" & $i & "个title标签为:" & $titleHtmlDemos[$i])
    Next
EndFunc   ;==>main
下面是对应的函数，效率可能不怎么高，大家也可以帮忙优化的
; ---------------------------------------------------------------------------
; Minimal tag-based HTML/XML helpers.
;
; Conventions used throughout:
;   * String positions are 1-based; 0 means "not found".
;   * Arrays produced by J_String(..., "拆分", ...) are consumed from index 1
;     to UBound - 1 (NOTE(review): exact J_String semantics are not visible
;     here; this mirrors how the original code iterates them).
;   * Tag lookup is a plain substring search for "<" & $tag, so a tag name
;     that is a prefix of another (e.g. "a" vs "abbr") will also match.
;     Attribute values containing ">" or "=" are not supported.
; ---------------------------------------------------------------------------

; Returns an array (indexes 1..UBound-1) of all $tag elements, at any nesting
; depth, whose attribute $attrName equals $attrValue (case-sensitive).
; Nested matches are listed before the element that contains them.
Func esGetHtmlByTagAndAttr($html, $tag, $attrName, $attrValue)
    ; Matches are joined with chr(10) and split into an array at the end.
    Dim $result = ""
    $closeToken = "</" & $tag & ">"
    $curElement = eGetHtmlByTag($html, $tag)
    ; eGetHtmlByTag returns 0 (length 1) when nothing is found.
    While StringLen($curElement) > 2
        ; Continue after this occurrence only. Replacing the element text
        ; everywhere would also drop later identical elements.
        $html = _eTextAfter($html, $curElement)
        ; Line breaks would collide with the chr(10) separator.
        $curElement = J_StringReplace($curElement, chr(10), "")
        $curElement = J_StringReplace($curElement, chr(13), "")
        ; Descend into the element body when it is a properly closed pair.
        $openEnd = indexOf($curElement, ">")
        $innerEnd = StringLen($curElement) - StringLen($closeToken)
        If $openEnd > 0 And $innerEnd > $openEnd And rightString($curElement, StringLen($closeToken)) = $closeToken Then
            $innerString = subString($curElement, $openEnd + 1, $innerEnd)
            $subElement = esGetHtmlByTagAndAttr($innerString, $tag, $attrName, $attrValue)
            For $i = 1 To UBound($subElement) - 1
                ; Skip empty entries that an empty split result may contain.
                If StringLen($subElement[$i]) > 0 Then
                    $result &= $subElement[$i] & chr(10)
                EndIf
            Next
        EndIf
        ; Keep the current element itself when its attribute matches.
        If eGetAttr($curElement, $attrName) == $attrValue Then
            $result &= $curElement & chr(10)
        EndIf
        $curElement = eGetHtmlByTag($html, $tag)
    WEnd
    ; Drop the trailing separator.
    If lengthString($result) > 0 Then
        $result = subString($result, 1, lengthString($result) - 1)
    EndIf
    Return J_String($result, "拆分", chr(10))
EndFunc   ;==>esGetHtmlByTagAndAttr

; Returns an array (indexes 1..UBound-1) of the $tag elements found at the
; current level of $html; elements nested inside another $tag element are
; part of their parent and are not listed separately.
Func esGetHtmlByTag($html, $tag)
    Dim $result = ""
    $curElement = eGetHtmlByTag($html, $tag)
    While StringLen($curElement) > 2
        ; Continue the search after this occurrence only.
        $html = _eTextAfter($html, $curElement)
        ; Line breaks would collide with the chr(10) separator.
        $curElement = J_StringReplace($curElement, chr(10), "")
        $curElement = J_StringReplace($curElement, chr(13), "")
        $result &= $curElement & chr(10)
        $curElement = eGetHtmlByTag($html, $tag)
    WEnd
    ; Drop the trailing separator.
    If lengthString($result) > 0 Then
        $result = subString($result, 1, lengthString($result) - 1)
    EndIf
    Return J_String($result, "拆分", chr(10))
EndFunc   ;==>esGetHtmlByTag

; Internal helper: returns the part of $html that follows the first
; occurrence of $element, or "" when nothing follows / it is not found.
Func _eTextAfter($html, $element)
    $loc = indexOf($html, $element)
    If Not($loc > 0) Then
        ; Returning "" guarantees that the callers' loops terminate.
        Return ""
    EndIf
    $restStart = $loc + StringLen($element)
    If $restStart > StringLen($html) Then
        Return ""
    EndIf
    Return subString($html, $restStart, StringLen($html))
EndFunc   ;==>_eTextAfter

; Returns the value of attribute $attr of the element whose HTML is $html
; (the opening tag must be at the start of $html), or "" when the element
; has no such attribute. Surrounding quotes are removed from the value.
; Attribute names are compared case-insensitively.
Func eGetAttr($html, $attr)
    ; The first space separates the tag name from the attribute list.
    $lStart = indexOf($html, " ")
    If Not($lStart > 0) Then
        Return ""
    EndIf
    ; End of the opening tag; there must be room for attributes before it.
    $lEnd = indexOf($html, ">")
    If Not($lEnd > ($lStart + 2)) Then
        Return ""
    EndIf
    $tag = subString($html, 2, $lStart - 1)
    $attrText = subString($html, $lStart + 1, $lEnd - 1)
    ; Not a paired element: remove the "/" of a self-closing tag, but only
    ; when it really is the last character, so "/" inside values survives.
    If Not(indexOf($html, "</" & $tag & ">") > 0) Then
        $attrText = trimString($attrText)
        If rightString($attrText, 1) == "/" Then
            $attrText = leftString($attrText, lengthString($attrText) - 1)
        EndIf
    EndIf
    ; Splitting on "=" yields: name1 | value1 name2 | value2 name3 | ... | valueN
    $attrs = J_String($attrText, "拆分", "=")
    For $i = 1 To UBound($attrs) - 2
        ; The attribute name is whatever follows the last space of the chunk
        ; (lastIndexOf gives 0 for the first chunk, so the whole chunk is used).
        $attrName = $attrs[$i]
        $attrNameStart = lastIndexOf($attrName, " ")
        $attrName = subString($attrName, $attrNameStart + 1, lengthString($attrName))
        ; The value lives in the next chunk, followed by the next name
        ; unless this is the last attribute.
        $attrValue = $attrs[$i + 1]
        If $i <> UBound($attrs) - 2 Then
            $attrValueEnd = lastIndexOf($attrValue, " ")
            If $attrValueEnd > 0 Then
                $attrValue = subString($attrValue, 1, $attrValueEnd - 1)
            EndIf
        EndIf
        If trimString($attrName) = $attr Then
            ; Trim first so padding between attributes does not hide the
            ; closing quote from the quote-stripping below.
            $result = trimString($attrValue)
            If leftString($result, 1) == "'" Or leftString($result, 1) == '"' Then
                $result = subString($result, 2, lengthString($result))
            EndIf
            If rightString($result, 1) == "'" Or rightString($result, 1) == '"' Then
                $result = subString($result, 1, lengthString($result) - 1)
            EndIf
            Return $result
        EndIf
    Next
    Return ""
EndFunc   ;==>eGetAttr

; Returns the text content of the first $tag element in $html with all
; nested markup removed, or "" when there is no such element or no content.
Func eGetTextByTag($html, $tag)
    $element = eGetHtmlByTag($html, $tag)
    ; Content lies between the end of the opening tag and the last "<".
    $lStart = indexOf($element, ">")
    $lEnd = lastIndexOf($element, "<")
    If Not($lEnd > $lStart) Then
        Return ""
    EndIf
    $inner = subString($element, $lStart + 1, $lEnd - 1)
    ; Strip any remaining tags.
    Return J_EXP_Replace($inner, "<[^<]*>", "")
EndFunc   ;==>eGetTextByTag

; Returns the inner HTML of the element whose HTML is $html, i.e. everything
; between its opening tag and its closing tag. Returns "" when $html does
; not end with the matching closing tag (for example a self-closing tag).
Func eGetInnerHtml($html)
    $lStart = indexOf($html, ">")
    If Not($lStart > 0) Then
        Return ""
    EndIf
    ; The tag name ends at the first space inside the opening tag, or at ">"
    ; when the element has no attributes (including when there is no space
    ; in the string at all).
    $spaceStart = indexOf($html, " ")
    If $spaceStart > 0 And $spaceStart < $lStart Then
        $nameEnd = $spaceStart - 1
    Else
        $nameEnd = $lStart - 1
    EndIf
    Dim $tag = ""
    $tag = subString($html, 2, $nameEnd)
    ; "</" + tag + ">" is StringLen($tag) + 3 characters long.
    $endLabel = rightString($html, StringLen($tag) + 3)
    j_log("endLabel:" & $endLabel)
    If $endLabel == "</" & $tag & ">" Then
        Return subString($html, $lStart + 1, lastIndexOf($html, "<") - 1)
    EndIf
    Return ""
EndFunc   ;==>eGetInnerHtml

; Internal helper: True when $openTag (text from "<" through ">") is a
; self-closing tag, allowing whitespace between "/" and ">".
Func _eIsSelfClosing($openTag)
    $normalized = J_EXP_Replace($openTag, "/\s+>", "/>")
    Return rightString($normalized, 2) == "/>"
EndFunc   ;==>_eIsSelfClosing

; Returns the complete HTML of the first $tag element in $html, or 0 when
; there is none.
;   * A self-closing tag is returned as-is, including its closing ">".
;   * For a paired tag, nested elements of the same name are balanced, so
;     the result ends at the closing tag that belongs to the first opening
;     tag, however many same-name children it has.
;   * When no matching closing tag exists, only the opening tag is returned.
; The result is always a literal substring of $html.
Func eGetHtmlByTag($html, $tag)
    $openToken = "<" & $tag
    $closeToken = "</" & $tag & ">"
    ; Locate the first opening tag.
    $lStart = indexOf($html, $openToken)
    If Not($lStart > 0) Then
        Return 0
    EndIf
    ; Work on the text from the opening tag to the end of the input.
    $result = subString($html, $lStart, StringLen($html))
    $firstGt = indexOf($result, ">")
    If Not($firstGt > 0) Then
        ; The opening tag is never terminated.
        Return 0
    EndIf
    $openTag = leftString($result, $firstGt)
    If _eIsSelfClosing($openTag) Then
        J_LOG("single")
        Return $openTag
    EndIf
    J_LOG("no single")
    ; Walk forward, counting same-name opening and closing tags until the
    ; first opening tag is balanced.
    $depth = 1
    $pos = $firstGt + 1
    While $depth > 0
        $nextClose = inString($result, $closeToken, $pos)
        If Not($nextClose > 0) Then
            ; Unbalanced markup: fall back to the opening tag alone.
            Return $openTag
        EndIf
        $nextOpen = inString($result, $openToken, $pos)
        If $nextOpen > 0 And $nextOpen < $nextClose Then
            ; A nested element starts first. A ">" is guaranteed to follow
            ; because the closing tag after it contains one.
            $nestedGt = inString($result, ">", $nextOpen)
            If Not(_eIsSelfClosing(subString($result, $nextOpen, $nestedGt))) Then
                $depth = $depth + 1
            EndIf
            $pos = $nestedGt + 1
        Else
            $depth = $depth - 1
            $pos = $nextClose + StringLen($closeToken)
        EndIf
    WEnd
    ; $pos now points just past the balancing closing tag.
    Return leftString($result, $pos - 1)
EndFunc   ;==>eGetHtmlByTag

; Returns the characters of $str from position $start through position $end
; (both 1-based and inclusive).
Func subString($str, $start, $end)
    Return J_String($str, "取子串", $start & "," & ($end - $start + 1))
EndFunc   ;==>subString

; Returns the leftmost $count characters of $str.
Func leftString($str, $count)
    Return J_String($str, "取左子串", $count)
EndFunc   ;==>leftString

; Returns the rightmost $count characters of $str.
Func rightString($str, $count)
    Return J_String($str, "取右子串", $count)
EndFunc   ;==>rightString

; Returns the length of $str.
Func lengthString($str)
    Return StringLen($str)
EndFunc   ;==>lengthString

; Returns the 1-based position of the first occurrence of $findStr in $str,
; or 0 when it does not occur (as relied upon by every caller in this file).
Func indexOf($str, $findStr)
    Return J_String($str, "是否包含", $findStr)
EndFunc   ;==>indexOf

; Find the position of a substring, starting the search at a given offset.
Func inString($str, $findStr, $start)
    ; Returns the position of the first occurrence of $findStr in $str at or
    ; after position $start, or 0 when there is none.
    ; NOTE(review): presumably follows AutoIt StringInStr semantics, where a
    ; casesense argument of 0 means a case-insensitive search - confirm.
    Return StringInStr($str, $findStr, 0, 1, $start)
EndFunc   ;==>inString

; Returns $str without leading and trailing whitespace, where whitespace is
; any character matched by the pattern "\s".
Func trimString($str)
    ; Strip trailing whitespace, walking backwards from the last character.
    For $i = lengthString($str) To 1 Step -1
        $curString = subString($str, $i, $i)
        If lengthString(J_EXP($curString, "\s")) > 0 Then
            $str = subString($str, 1, $i - 1)
        Else
            ExitLoop
        EndIf
    Next
    ; Strip leading whitespace one character at a time.
    While True
        $length = lengthString($str)
        $curString = subString($str, 1, 1)
        If lengthString(J_EXP($curString, "\s")) > 0 Then
            $str = subString($str, 2, $length)
        Else
            ExitLoop
        EndIf
    WEnd
    Return $str
EndFunc   ;==>trimString

; Shorthand for showing $msg in a message box.
Func Mbox($msg)
    J_MsgBox($msg)
EndFunc   ;==>Mbox

; Returns $str with its characters in reverse order.
Func StrReverse($str)
    $length = J_String($str, "长度", "")
    $nstr = ""
    ; Append the characters from last to first.
    For $i = $length To 1 Step -1
        $nstr = $nstr & J_String($str, "取子串", $i & "," & 1)
    Next
    Return $nstr
EndFunc   ;==>StrReverse

; Returns the 1-based start position of the last occurrence of $findStr in
; $str, or 0 when it does not occur (or $findStr is empty).
Func lastIndexOf($str, $findStr)
    $length = J_String($str, "长度", "")
    $findLength = J_String($findStr, "长度", "")
    If $findLength = 0 Then
        Return 0
    EndIf
    ; Search the reversed text for the reversed needle. The needle has to be
    ; reversed too, otherwise multi-character needles never line up.
    $loc = J_String(StrReverse($str), "是否包含", StrReverse($findStr))
    If $loc = 0 Then
        Return 0
    EndIf
    ; $loc is where the needle's last character lands in the reversed text;
    ; convert it to the needle's start position in the original text.
    ; For a one-character needle this equals $length - $loc + 1.
    Return $length - $loc - $findLength + 2
EndFunc   ;==>lastIndexOf

; Alias of lengthString, kept so existing callers of this name keep working.
Func lengthStringd($str)
    Return StringLen($str)
EndFunc   ;==>lengthStringd

简单解析XML和HTML

相关课程

相关电子书

相关实验场景

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

简单解析XML和HTML

相关课程

相关文章

相关电子书

相关实验场景

相关镜像