开发者社区> 问答> 正文

简单解析XML和HTML

    网页(HTML)在结构上和XML类似,都是由标签组成的标记文本,所以可以借用解析XML的思路来解析它,不一定非要用正则表达式——正则不好记,而且它本质上是在处理纯文本。我们可以把录制到的控件当成一小段XML来处理,一般也就是取一下它的文本(TEXT)或者属性值。下面是一个简化版的解析XML(或者说HTML)的方法。
    先来看下是怎么用的
;Demo entry point: runs every helper of this file against a small sample document
;and shows each result in a message box.
Func main()
        ;sample document: a self-closing <meta>, nested <title> elements and a sibling <title>
        $html = "<html><head><meta charset='utf-8'   class='hello world'/><title id='133' class=1><title class=1 />hello<title class='ha'>world</title></title><title class='ha'>This is the second title</title></head></html>"
        ;HTML of the first matching tag; when several tags match only the first one is returned
        $titleHtml = eGetHtmlByTag($html, "title")
        J_MsgBox("首个title标签的html代码为:" & $titleHtml)

        ;value of one attribute of that tag
        $titleAttr = eGetAttr($titleHtml, "id")
        J_MsgBox("首个title标签的id属性值为:" & $titleAttr)
        ;visible text of a tag (nested markup stripped)
        $headText = eGetTextByTag($html, "head")
        J_MsgBox("首个head标签的文本内容为:" & $headText)
        ;HTML of every matching tag on the current level
        $titleHtmls = esGetHtmlByTag($html, "title")
        ;the elements occupy indexes 1 to UBound - 1, so the element count is UBound - 1
        J_MsgBox("当前层的title标签共有:" & (UBound($titleHtmls) - 1))
        for $i = 1 to UBound($titleHtmls) - 1

                J_MsgBox("第" & $i & "个title标签为:" & $titleHtmls[$i])

                ;repeat the search one level deeper, inside the current element
                $subTitleHtmls = esGetHtmlByTag(eGetInnerHtml($titleHtmls[$i]), "title")
                for $j = 1 to UBound($subTitleHtmls) - 1
                        J_MsgBox("内层第" & $j & "个标签:" & $subTitleHtmls[$j])
                Next

        Next
        ;every title tag whose class attribute equals "ha"
        $titleHtmlDemos = esGetHtmlByTagAndAttr($html, "title", "class", "ha")
        for $i = 1 to UBound($titleHtmlDemos) - 1
                J_MsgBox("class为ha的第" & $i & "个title标签为:" & $titleHtmlDemos[$i])
        Next
EndFunc   ;==>main

下面是对应的函数,效率可能不怎么高,大家也可以帮忙优化的
;Collects every <$tag> element, nested ones included, whose $attrName attribute equals $attrValue.
;Returns the array produced by J_String(..., "拆分", chr(10)); callers in this file read indexes 1 to UBound - 1.
Func esGetHtmlByTagAndAttr($html, $tag, $attrName, $attrValue)
        ;newline-separated accumulator that is split into an array on return
        dim $result
        $element = eGetHtmlByTag($html, $tag)
        ;eGetHtmlByTag returns 0 once nothing matches, which is shorter than any real element
        while StringLen($element) > 3
                ;remove the element just found so that the next search moves on
                $html = J_StringReplace($html, $element, "")
                ;line breaks inside the element would collide with the chr(10) separator
                $element = J_StringReplace(J_StringReplace($element, chr(10), ""), chr(13), "")
                ;cut into the outer start tag and end tag so that the recursive search only matches nested elements
                $inner = subString($element, StringLen($tag) + 1, StringLen($element) - StringLen($tag) - 2)
                $nested = esGetHtmlByTagAndAttr($inner, $tag, $attrName, $attrValue)
                ;nested matches are appended before the enclosing element itself
                for $k = 1 to UBound($nested) - 1
                        $result &= $nested[$k] & chr(10)
                Next
                ;keep the current element when its attribute has the requested value
                If eGetAttr($element, $attrName) == $attrValue Then
                        $result &= $element & chr(10)
                EndIf
                $element = eGetHtmlByTag($html, $tag)
        WEnd
        ;drop the trailing separator
        $result = subString($result, 1, StringLen($result) - 1)
        return J_String($result, "拆分", chr(10))
EndFunc   ;==>esGetHtmlByTagAndAttr



;Collects the <$tag> elements of the current level of $html.
;Returns the array produced by J_String(..., "拆分", chr(10)); callers in this file read indexes 1 to UBound - 1.
Func esGetHtmlByTag($html, $tag)
        ;newline-separated accumulator that is split into an array on return
        dim $result
        while True
                $element = eGetHtmlByTag($html, $tag)
                ;eGetHtmlByTag returns 0 once nothing matches, which ends the scan
                if not(StringLen($element) > 2) Then
                        ExitLoop
                EndIf
                ;remove the element just found so that the next search moves on
                $html = J_StringReplace($html, $element, "")
                ;line breaks inside the element would collide with the chr(10) separator
                $element = J_StringReplace(J_StringReplace($element, chr(10), ""), chr(13), "")
                $result &= $element & chr(10)
        WEnd
        ;drop the trailing separator
        $result = subString($result, 1, StringLen($result) - 1)
        return J_String($result, "拆分", chr(10))
EndFunc   ;==>esGetHtmlByTag

;Returns the value of attribute $attr of the first tag in $html, without surrounding quotes.
;Returns "" when the tag has no attributes or the attribute is missing.
;Limitation: the attribute text is split on "=", so values that contain "=" are not supported.
Func eGetAttr($html, $attr)
        ;the tag name ends at the first space; without a space there cannot be an attribute
        $lStart = indexOf($html, " ")
        if not($lStart > 0) Then
                return ""
        EndIf
        ;tag name: everything between "<" and the first space
        $tag = subString($html, 2, $lStart - 1)
        Dim $result
        ;end of the start tag
        $lEnd = indexOf($html, ">")
        ;only when ">" lies beyond the space is there room for attributes
        if $lEnd > ($lStart + 2) then
                ;attribute text, trimmed so that trailing blanks cannot hide a closing quote or "/"
                $result = trimString(subString($html, $lStart + 1, $lEnd - 1))
                ;a tag without an end tag may be self-closing: remove the "/" only when it really
                ;terminates the tag, so a "/" inside a value of an unclosed tag is left alone
                if not(indexOf($html, "</" & $tag & ">") > 0) and rightString($result, 1) == "/" then
                        $result = leftString($result, lengthString($result) - 1)
                EndIf
        EndIf
        ;nothing to parse
        if lengthString($result) = 0 then
                return ""
        EndIf
        ;split on "=": piece 1 is the first name, every following piece is
        ;"previous value<space>next name", and the last piece is the last value
        $attrs = J_String($result, "拆分", "=")
        for $i = 1 to UBound($attrs) - 2
                ;the name is the last word of the current piece; trimming first tolerates blanks before "="
                $attrName = trimString($attrs[$i])
                $attrNameStart = lastIndexOf($attrName, " ")
                ;lastIndexOf gives 0 for the first piece, so + 1 is the right start in both cases
                $attrName = subString($attrName, $attrNameStart + 1, lengthString($attrName))
                ;the value lives in the next piece; trimming first tolerates blanks after "="
                $attrValue = trimString($attrs[$i + 1])
                ;every piece except the last one also carries the next attribute name after the last space
                if $i <> UBound($attrs) - 2 then
                        $attrValueEnd = lastIndexOf($attrValue, " ")
                        if $attrValueEnd > 0 then
                                ;trim again: several blanks may separate the value from the next name
                                $attrValue = trimString(subString($attrValue, 1, $attrValueEnd - 1))
                        EndIf
                endif
                ;return as soon as the requested attribute is found (name comparison ignores case)
                if not(trimString($attrName) <> $attr) then
                        $result = $attrValue
                        ;strip one leading and one trailing quote
                        if leftString($result, 1) == "'" or leftString($result, 1) == '"' Then
                                $result = subString($result, 2, lengthString($result))
                        EndIf
                        if rightString($result, 1) == "'" or rightString($result, 1) == '"' Then
                                $result = subString($result, 1, lengthString($result) - 1)
                        EndIf
                        return $result
                EndIf
        Next
        return ""
EndFunc   ;==>eGetAttr

;Returns the text content of the first <$tag> element in $html with every nested tag removed,
;or "" when the element has nothing between its first ">" and its last "<".
Func eGetTextByTag($html, $tag)
        $element = eGetHtmlByTag($html, $tag)
        ;end of the start tag and beginning of the last tag of the element
        $openEnd = indexOf($element, ">")
        $closeStart = lastIndexOf($element, "<")
        if not($closeStart > $openEnd) Then
                Return ""
        EndIf
        $inner = subString($element, $openEnd + 1, $closeStart - 1)
        ;strip the markup that is left inside
        return J_EXP_Replace($inner, "<[^<]*>", "")
EndFunc   ;==>eGetTextByTag


;Returns the inner HTML of the element in $html: everything between the end of its
;start tag and the beginning of its last tag. Returns "" when there is no such content.
Func eGetInnerHtml($html)
        ;end of the start tag
        $lStart = indexOf($html, ">")
        ;first space of the element; 0 when the element contains no space at all
        $spaceStart = indexOf($html, " ")
        ;A space before the first ">" means the start tag carries attributes.
        ;$spaceStart must be checked for 0 explicitly: otherwise an element without any
        ;space, e.g. <title>hello</title>, is taken for a tag with attributes, yields a
        ;bogus tag name and produces "".
        if $spaceStart > 0 and $lStart > $spaceStart then
                ;tag name: everything between "<" and the first space
                $tag = subString($html, 2, $spaceStart - 1)
                ;the element must end with the matching end tag, otherwise there is no inner HTML
                $endLabel = rightString($html, Stringlen($tag) + 3)
                j_log("endLabel:" & $endLabel)
                if not($endLabel == "</" & $tag & ">") then
                        return ""
                EndIf
                return subString($html, $lStart + 1, lastIndexOf($html, "<") - 1)
        EndIf
        ;start tag without attributes: take what lies between the first ">" and the last "<"
        $overLoc = lastIndexOf($html, "<")
        if $overLoc > $lStart Then
                return subString($html, $lStart + 1, $overLoc - 1)
        EndIf
        Return ""
EndFunc   ;==>eGetInnerHtml
;Returns the HTML of the first <$tag> element in $html, from its start tag through its end tag
;(or through "/>" for a self-closing tag). Returns 0 when $html contains no "<" & $tag.
;Note: "<" & $tag is matched as a plain prefix, so a longer tag name starting with $tag also matches.
Func eGetHtmlByTag($html, $tag)
        #cs
                1. Check whether the first match is a self-closing tag; if so return just that tag.
                2. Otherwise count the start tags that occur before the first end tag.
                3. With more than one start tag, skip the first one and resolve the first nested element recursively.
                4. Look for the end tag only behind that nested element.
                5. Without nesting, cut at the first end tag.
        #ce
        $skip = 1
        ;position of the first start tag
        $lStart = indexOf($html, '<' & $tag)
        if $lStart > 0 Then
                ;everything from the start tag to the end of the input
                $result = subString($html, $lStart, StringLen($html))
                ;position of the first end tag
                $lEnd = inString($result, "</" & $tag & ">", $skip)

                ;normalise "/   >" to "/>" so that a self-closing tag is recognised despite inner blanks
                $single = J_EXP_Replace($result, "/\s+>", "/>")
                $singleOverLoc = indexOf($single, ">")
                $singleFullOverLoc = indexOf($single, "/>")
                J_LOG(">:" & $singleOverLoc & ",/>:" & $singleFullOverLoc & "," & $single)
                ;the tag is self-closing when the first ">" is the one that belongs to the first "/>"
                if Not($singleFullOverLoc + 1 == $singleOverLoc) Then
                        J_LOG("no single")
                        ;text from the first start tag through the first end tag
                        $tempString = leftString($result, $lEnd + 2 + lengthString($tag))
                        J_LOG("tempString:" & $tempString)
                        ;number of start tags in that text = length removed by deleting them / length of one start tag
                        if (StringLen($tempString) - StringLen(J_StringReplace($tempString, "<" & $tag, ""))) / (StringLen($tag) + 1) > 1 Then
                                ;position of the second start tag
                                $skip = inString($tempString, "<" & $tag, StringLen($tag) + 1)
                                ;input from the second start tag on
                                $subStringTemp = subString($result, $skip, StringLen($result))
                                ;the nested element
                                $tempString = eGetHtmlByTag($subStringTemp, $tag)
                                ;search position just behind the nested element
                                $skip = StringLen($tempString) + $skip
                                ;end tag of the current level
                                $lEnd = inString($result, "</" & $tag & ">", $skip)
                        EndIf
                        ;cut the element of the current level, end tag included
                        $result = leftString($result, $lEnd + 2 + StringLen($tag))
                Else
                        J_LOG("single")
                        ;Self-closing tag: cut through the first ">" of the raw text, which is the one
                        ;that closes this tag. Cutting at the position of "/>" dropped the ">" itself
                        ;and found nothing when blanks separate "/" and ">".
                        $result = leftString($result, indexOf($result, ">"))
                EndIf
                return $result
        Else
                return 0
        EndIf
EndFunc   ;==>eGetHtmlByTag



;Returns the part of $str from position $start through position $end, both inclusive.
func subString($str, $start, $end)
        ;J_String expects "start,count", so the end position is converted into a length
        $count = $end - $start + 1
        return J_String($str, "取子串", $start & "," & $count)
EndFunc   ;==>subString

;Returns the result of the platform's left-substring operation for $str and $count.
func leftString($str, $count)
        $left = J_String($str, "取左子串", $count)
        return $left
EndFunc   ;==>leftString

;Returns the result of the platform's right-substring operation for $str and $count.
func rightString($str, $count)
        $right = J_String($str, "取右子串", $count)
        return $right
EndFunc   ;==>rightString

;Returns the number of characters in $str.
Func lengthString($str)
        $count = StringLen($str)
        return $count
EndFunc   ;==>lengthString

;Returns the position of $findStr inside $str as reported by the platform's "contains" operation.
;The callers in this file treat the result as a 1-based position, with 0 meaning "not found".
Func indexOf($str, $findStr)
        $position = J_String($str, "是否包含", $findStr)
        return $position
EndFunc   ;==>indexOf

;Returns the position of the first occurrence of $findStr in $str, searching from position $start
;and ignoring case; 0 when there is no occurrence.
Func inString($str, $findStr, $start)
        ;arguments: case sense 0 (ignore case), occurrence 1 (first match), start position
        $position = StringInStr($str, $findStr, 0, 1, $start)
        return $position
EndFunc   ;==>inString

;Removes the whitespace (characters matched by the pattern "\s") at both ends of $str.
Func trimString($str)
        ;walk backwards from the last character while it is whitespace
        $tail = lengthString($str)
        while $tail >= 1
                $lastChar = subString($str, $tail, $tail)
                if not(lengthString(J_EXP($lastChar, "\s")) > 0) Then
                        ExitLoop
                EndIf
                $str = subString($str, 1, $tail - 1)
                $tail -= 1
        WEnd
        ;then drop the first character for as long as it is whitespace
        $firstChar = subString($str, 1, 1)
        while lengthString(J_EXP($firstChar, "\s")) > 0
                $str = subString($str, 2, lengthString($str))
                $firstChar = subString($str, 1, 1)
        WEnd
        Return $str
EndFunc   ;==>trimString

;Shorthand for J_MsgBox: shows $msg in a message box.
Func Mbox($msg)
        J_MsgBox($msg)
EndFunc   ;==>Mbox
;Returns $str with its characters in reverse order.
Func StrReverse($str)
        $total = J_String($str, "长度", "")
        $reversed = ""
        ;append the characters one at a time, from the last position down to the first
        $pos = $total
        while $pos >= 1
                $reversed &= J_String($str, "取子串", $pos & ",1")
                $pos -= 1
        WEnd
        return $reversed
EndFunc   ;==>StrReverse

;Returns the 1-based position at which the last occurrence of $findStr starts in $str,
;or 0 when $findStr does not occur.
Func lastIndexOf($str, $findStr)
        ;The last occurrence in $str is the first occurrence in the reversed string,
        ;provided the search text is reversed as well. Searching for the un-reversed
        ;text only works for one-character search strings.
        $nstr = StrReverse($str)
        $length = J_String($str, "长度", "")
        $findLength = J_String($findStr, "长度", "")
        $loc = J_String($nstr, "是否包含", StrReverse($findStr))
        if $loc = 0 Then
                Return 0
        EndIf
        ;The match covers reversed positions $loc to $loc + $findLength - 1. Its last reversed
        ;position maps back to the first position of the match in $str. For a one-character
        ;search string this equals $length - $loc + 1.
        return $length - $loc - $findLength + 2
EndFunc   ;==>lastIndexOf

;Returns the number of characters in $str (same computation as lengthString).
Func lengthStringd($str)
        $count = StringLen($str)
        return $count
EndFunc   ;==>lengthStringd





展开
收起
水晶心泉 2017-03-05 14:00:22 2367 0
1 条回答
写回答
取消 提交回答
  • Re简单解析XML和HTML
    学习了
    2017-03-11 23:06:39
    赞同 展开评论 打赏
问答排行榜
最热
最新

相关电子书

更多
神龙云服务器产品及技术深度解析 立即下载
弹性创造价值:基于ECS的最佳性价比实践解析 立即下载
又快又稳:阿里云下一代虚拟交换机解析 立即下载

相关镜像