html_validator/HTML_Validator.py at master · dwang862/html_validator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/python3


def validate_html(html):
    '''
    Perform a limited form of html validation by checking that every
    opening tag is closed by a matching closing tag in properly nested order.

    The tags are obtained from `_extract_tags` and processed with the
    balanced parentheses (stack) algorithm.
    Only tag names are compared, so attributes on an opening tag are ignored;
    a self-closing tag such as `<br/>` needs no separate closing tag.

    Returns True when the tags are balanced and False otherwise.
    The empty string is valid;
    a non-empty string from which no tag can be extracted is invalid.

    >>> validate_html('<strong>example</strong>')
    True
    >>> validate_html('<strong>example')
    False
    >>> validate_html('<a></b></a>')
    False
    >>> validate_html('<a href="http://example.com/x">link</a>')
    True
    '''
    if html == '':
        return True

    tags = _extract_tags(html)
    if not tags:
        # non-empty input without a single complete tag (e.g. a lone '<')
        return False

    def tag_name(inner):
        # first whitespace-delimited token; '' for an empty tag like '<>'
        tokens = inner.split()
        return tokens[0] if tokens else ''

    stack = []
    for tag in tags:
        if not tag.startswith('<'):
            # a '>' that was never opened is not a tag at all
            return False
        if tag.startswith('</'):
            # a closing tag must match the most recently opened tag;
            # checking the prefix (rather than "contains '/'") keeps
            # attribute values such as URLs from looking like closing tags
            if not stack or stack.pop() != tag_name(tag[2:-1]):
                return False
        elif tag.endswith('/>'):
            # self-closing tag: opens and closes itself
            continue
        else:
            stack.append(tag_name(tag[1:-1]))

    # anything left on the stack was opened but never closed
    return not stack


def _extract_tags(html):
    '''
    This is a helper function for `validate_html`.
    By convention in Python, helper functions that are not meant to be
    used directly by the user are prefixed with an underscore.
    This function returns a list of all the html tags
    contained in the input string,
    stripping out all text not contained within angle brackets.

    >>> _extract_tags('Python <strong>rocks</strong>!')
    ['<strong>', '</strong>']
    '''
    taglist = []
    word = ''
    openbracket = False
    for i in range(len(html)):
        if html[i] == '<':
            openbracket = True
            word = word + html[i]
        elif html[i] == '>':
            openbracket = False
            word = word + html[i]
            taglist.append(word)
            word = ''
        else:
            if openbracket is True:
                word = word + html[i]
    return taglist