GAS用パーサー

01. <div>
02. <div>
03. <span>
04. </span>
05. </div>
06. </div>
07. <div>
08. <div>
09. </div>
10. </div>


<div> (01. )の終了タグである</div> (06. )を見つける方法

  1. (01. )と(02. 以降)で分割(それぞれbef, aft)
  2. <div> (01. )と同種(この場合はdiv)の開始タグと終了タグを併せて検知できる正規表現(この時は開始・終了を区別しなくてよい)を生成し、aftを分割(list)
  3. listを参照しながらaft内に登場する開始タグと終了タグの順番を判別、カウントを開始(カウントはそれぞれst_cnt, ed_cnt)
    • befに既に開始タグが一つあるのでst_cntの初期値は1、ed_cntは0とする
  4. st_cntとed_cntの値が一致した時点でカウントを終了し、インデックスを記録(sp_idx)
    • forEachを使うと楽にインデックスを参照できる
  5. list.slice(0, sp_idx)した配列を繋ぎ合わせたものが<div> (01. )〜</div> (06. )の子要素・ノード、list[sp_idx]が</div> (06. )、list.slice(sp_idx + 1)した配列を繋ぎ合わせたものが<div> (01. )〜</div> (06. )の兄弟要素・ノードとなる
/**
 * Small regex-based HTML parser that builds a node tree plus lookup indexes.
 *
 * Can be called with or without `new`; a bare call re-invokes itself with `new`.
 *
 * Instance properties written by the constructor:
 *   - html:  array of top-level nodes in document order.
 *   - tags:  { tagName: [node, ...] }. A node is added after its own subtree and
 *            its following siblings have been parsed.
 *   - attr:  { attrName: { value: [node, ...] } }. For `class`, each
 *            space-separated class name is indexed as its own value.
 *   - title: raw inner markup of the last processed <title> element (only set
 *            when the document contains one).
 *
 * Node shape: { tagname, attr, innerText, tree, parent }.
 *   - Elements: `attr` is an object (a `class` value is an array of names, every
 *     other value is a string), `tree` holds child nodes.
 *   - Text nodes: tagname "_Node", `attr` is null, `tree` is [text].
 *   - `innerText` of an element is the concatenation of all text below it.
 *
 * Known limits (unchanged): only double-quoted attributes are recognised, tag
 * names are case-sensitive, and self-closing syntax such as <br/> is not
 * recognised as a tag (it stays inside the surrounding text).
 *
 * @param {string} html_str HTML source. null/undefined is treated as "".
 * @returns {HTMLparser} the parsed document.
 */
function HTMLparser(html_str) {
    if (!(this instanceof HTMLparser)) {
        return new HTMLparser(html_str);
    }
    var self = this;
    self.html = [];
    self.attr = {};
    self.tags = {};

    var hasOwn = Object.prototype.hasOwnProperty;
    // [1]: tag name, [2]: raw attribute text (including its leading space) or "".
    var TAG_REG = /<([a-zA-Z][^\t\n\r\f \/>\x00]*?)(| [a-zA-Z][^\t\n\r\f>\x00]*?[^\/])>/;

    function newDict() { return {}; }
    function newList() { return []; }

    // Returns dict[key], creating it with make() when dict has no OWN entry.
    // Own-property checks keep names such as "constructor" or "__proto__" from
    // resolving to inherited members of Object.prototype.
    function ownEntry(dict, key, make) {
        if (!hasOwn.call(dict, key)) {
            Object.defineProperty(dict, key, {
                value: make(), writable: true, enumerable: true, configurable: true
            });
        }
        return dict[key];
    }

    // Regex matching both the start tags and the end tag of `tagname`. The single
    // capture group makes String.prototype.split keep the matched tags.
    function sameTagReg(tagname) {
        // Tag names may contain regex metacharacters, so escape them first.
        var t = tagname.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        return new RegExp("(<" + t + ">|<" + t + " [a-zA-Z][^\\t\\n\\r\\f>\\x00]*?[^\\/]>|<\\/" + t + ">)");
    }

    // `pieces` alternates text (even indexes) and same-name tags (odd indexes).
    // Returns the index of the end tag that balances the already-open start tag,
    // or -1 when there is none.
    function findClose(pieces) {
        var depth = 1;
        for (var i = 1; i < pieces.length; i += 2) {
            if (pieces[i].charAt(1) === "/") {
                depth--;
                if (depth === 0) {
                    return i;
                }
            } else {
                depth++;
            }
        }
        return -1;
    }

    // Creates a text node and appends its text to the innerText of every ancestor.
    function makeTextNode(text, parent) {
        var node = {tagname: "_Node", attr: null, innerText: "", tree: [text], parent: parent};
        for (var o = node; o; o = o.parent) {
            o.innerText += text;
        }
        return node;
    }

    // Parses name="value" pairs, registers `node` in self.attr and returns the
    // node's own attribute object.
    function readAttrs(attrText, node) {
        var found = {};
        var reg = /([a-zA-Z][^\t\n\r\f >\x00]*)=\"(.*?)\"/g;
        var m;
        while ((m = reg.exec(attrText)) !== null) {
            var name = m[1];
            var byValue = ownEntry(self.attr, name, newDict);
            var value;
            if (name == "class") {
                value = m[2].split(" ");
                value.forEach(function(cls) {
                    ownEntry(byValue, cls, newList).push(node);
                });
            } else {
                value = m[2];
                ownEntry(byValue, value, newList).push(node);
            }
            found[name] = value;
        }
        return found;
    }

    // Parses `str` and inserts the resulting nodes at the front of `ary`.
    // Later siblings are parsed (and inserted) first, so `ary` ends up in
    // document order.
    function per(str, ary, parent) {
        var matchTag = str.match(TAG_REG);
        if (!matchTag) {
            ary.unshift(makeTextNode(str, parent));
            return;
        }

        // Text in front of the tag is a sibling text node. Its text is propagated
        // now so ancestors' innerText stays in document order; the node itself is
        // inserted last so that it precedes the element in `ary`.
        var lead = str.slice(0, matchTag.index);
        var leadNode = lead === "" ? null : makeTextNode(lead, parent);

        var tagname = matchTag[1];
        var obj = {tagname: tagname, attr: null, innerText: "", tree: [], parent: parent};
        obj.attr = readAttrs(matchTag[2], obj);

        var rest = str.slice(matchTag.index + matchTag[0].length);
        var pieces = rest.split(sameTagReg(tagname));
        var closeIdx = findClose(pieces);
        var child;
        var bro;
        if (closeIdx < 0) {
            // No matching end tag (void or unclosed element): the element has no
            // content and everything after the start tag is sibling content.
            child = "";
            bro = rest;
        } else {
            child = pieces.slice(0, closeIdx).join("");
            bro = pieces.slice(closeIdx + 1).join("");
        }

        per(child, obj.tree, obj);
        if (tagname == "title") {
            self.title = child;
        }
        if (bro !== "") {
            per(bro, ary, parent);
        }

        ary.unshift(obj);
        if (leadNode) {
            ary.unshift(leadNode);
        }
        ownEntry(self.tags, tagname, newList).push(obj);
    }

    var source = (html_str == null ? "" : String(html_str)).replace(/<!--[\s\S]*?-->/g, "");
    // Whitespace and a doctype in front of the first tag are dropped so that
    // html[0] is the root element rather than a text node.
    source = source.replace(/^\s*(?:<!DOCTYPE[^>]*>\s*)?(?=<[a-zA-Z])/i, "");
    per(source, self.html, null);
}
 
/*
var Html = HTMLparser(document.body.innerText);
Html
*/
HTMLparser.prototype = {
    /**
     * Finds element nodes matching a simple CSS selector.
     *
     * Supported: tag names (case-sensitive), `.class`, `#id`, `[attr]`,
     * `[attr="v"]`, `[attr^="v"]`, `[attr$="v"]`, `[attr*="v"]` (double or single
     * quotes), and the descendant (space) and child (`>`) combinators.
     * A `class` attribute is compared as its space-joined string in attribute
     * tests. An empty operator value (e.g. `[a^=""]`) matches any present value.
     *
     * Reads `this.tags`, `this.attr` and each node's `parent`, `tagname`, `attr`.
     *
     * @param {string} selector selector text.
     * @returns {Array} matching nodes without duplicates, in index order
     *     (`this.tags[tag]` order when the last compound names a tag).
     * @throws {Error} when the selector contains "+" or "~", or one of the
     *     unsupported pseudo-classes (:nth-child(), :not(), :first-child, ...).
     */
    search: function(selector) {
        var self = this;
        var hasOwn = Object.prototype.hasOwnProperty;
        // Compound selectors, rightmost first. chain[i].next === true means
        // chain[i] must be a direct child of chain[i + 1].
        var chain = sel_parse(selector);
        if (!chain.length) {
            return [];
        }

        var seen = new Set();
        return candidates(chain[0]).filter(function(node) {
            if (seen.has(node) || !matches(node, chain[0]) || !ancestorsMatch(node, 1)) {
                return false;
            }
            seen.add(node);
            return true;
        });

        // Own-property lookup, so inherited members never count as index entries.
        function own(dict, key) {
            return dict != null && hasOwn.call(dict, key) ? dict[key] : undefined;
        }

        // Splits a trailing *, ^ or $ off the attribute name produced by sel_parse.
        function splitOperator(cond) {
            var op = /[*^$]$/.exec(cond.atr);
            return op ? {name: cond.atr.slice(0, -1), op: op[0]} : {name: cond.atr, op: ""};
        }

        // Picks the smallest ready-made index list that must contain every match;
        // the caller still verifies each node with matches().
        function candidates(s) {
            if (s.tag) {
                return own(self.tags, s.tag) || [];
            }
            if (s.class.length) {
                return own(own(self.attr, "class"), s.class[0]) || [];
            }
            if (s.id.length) {
                return own(own(self.attr, "id"), s.id[0]) || [];
            }
            if (s.attr.length) {
                var byValue = own(self.attr, splitOperator(s.attr[0]).name);
                var all = [];
                if (byValue) {
                    Object.keys(byValue).forEach(function(value) {
                        all = all.concat(byValue[value]);
                    });
                }
                return all;
            }
            return [];
        }

        // True when the node's attributes satisfy one [attr...] condition.
        function attrMatches(attrs, cond) {
            var key = splitOperator(cond);
            if (!hasOwn.call(attrs, key.name)) {
                return false;
            }
            var value = attrs[key.name];
            value = Array.isArray(value) ? value.join(" ") : String(value);
            switch (key.op) {
                case "*": return value.indexOf(cond.que) !== -1;
                case "^": return value.startsWith(cond.que);
                case "$": return value.endsWith(cond.que);
                default: return cond.que === "" || value === cond.que;
            }
        }

        // True when `node` satisfies every part of the compound selector `s`.
        function matches(node, s) {
            var attrs = node.attr;
            if (!attrs) {
                return false; // text nodes carry attr === null
            }
            if (!s.tag && !s.class.length && !s.id.length && !s.attr.length) {
                return false; // nothing recognised in this compound
            }
            if (s.tag && node.tagname !== s.tag) {
                return false;
            }
            if (s.class.length) {
                var classes = hasOwn.call(attrs, "class") ? [].concat(attrs.class) : [];
                if (!s.class.every(function(cls) { return classes.indexOf(cls) >= 0; })) {
                    return false;
                }
            }
            if (s.id.length) {
                if (!hasOwn.call(attrs, "id") || !s.id.every(function(id) { return id === attrs.id; })) {
                    return false;
                }
            }
            return s.attr.every(function(cond) { return attrMatches(attrs, cond); });
        }

        // Checks chain[idx..] against the ancestors of `node`.
        function ancestorsMatch(node, idx) {
            if (idx >= chain.length) {
                return true;
            }
            var want = chain[idx];
            var anc = node.parent;
            if (chain[idx - 1].next) {
                return !!anc && matches(anc, want) && ancestorsMatch(anc, idx + 1);
            }
            for (; anc; anc = anc.parent) {
                if (matches(anc, want) && ancestorsMatch(anc, idx + 1)) {
                    return true;
                }
            }
            return false;
        }

        // Parses selector text into compound descriptors
        // {tag, class: [], id: [], attr: [{atr, que}], next}, rightmost first.
        function sel_parse(sel) {
            if (/[+~]/.test(sel)) {
                throw new Error('You cannot use Adjacent sibling combinator "+".');
            }
            if (/:(nth-child\(|nth-of-type\(|not\(|first-child|first-of-type|last-child|last-of-type)/.test(sel)) {
                throw new Error('You cannot use Pseudo-elements like ":nth-of-type()"');
            }
            var compounds = [];
            var childOfPrevious = false;
            sel.split(/( ?> ?|(?<=[a-zA-Z0-9\]\_\-]) (?=[a-zA-Z\[\.\#\_]))/g).forEach(function(part) {
                if (part === " ") {
                    return; // descendant combinator
                }
                if (/^ ?> ?$/.test(part)) {
                    childOfPrevious = true;
                    return;
                }
                var compound = {"tag": null, "class": [], "id": [], "attr": [], "next": childOfPrevious};
                childOfPrevious = false;
                // The lookbehind also accepts digits, "_" and "-" so that
                // "h1.title" splits into "h1" and ".title".
                part.split(/(\[.*?\]|(?<=(?:[a-zA-Z0-9\]\_\-]|^))(?:\.|\#)[a-zA-Z\_][a-zA-Z0-9\_\-]*)/g).forEach(function(piece) {
                    if (!piece) {
                        return;
                    }
                    var first = piece.charAt(0);
                    if (first === "#") {
                        compound.id.push(piece.slice(1));
                    } else if (first === ".") {
                        compound.class.push(piece.slice(1));
                    } else if (/^[a-zA-Z]/.test(piece)) {
                        compound.tag = piece;
                    } else {
                        var m = piece.match(/^\[(.*?)(?:\=(?:\"(.*?)\"|\'(.*?)\')|)\]/);
                        if (m) {
                            // m[2]: double-quoted value, m[3]: single-quoted value.
                            var que = m[2] !== undefined ? m[2] : (m[3] !== undefined ? m[3] : "");
                            compound.attr.push({"atr": m[1], "que": que});
                        }
                    }
                });
                compounds.unshift(compound);
            });
            return compounds;
        }
    }
};
// Usage example: parse the markup of the current page's <body>, then look up
// every <span> element. Constructing with `new` is equivalent to the bare call,
// because HTMLparser re-invokes itself with `new` when called without it.
var Html = new HTMLparser(document.body.outerHTML);
Html.search("span");
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License