HackerRank HTML Parser - Part 1 solution in Python

HackerRank HTML Parser – Part 1 solution in Python – In this HTML Parser – Part 1 problem, you are given an HTML code snippet of N lines. Your task is to print the start tags, end tags, and empty tags separately.

HTML
Hypertext Markup Language is a standard markup language used for creating World Wide Web pages.

Parsing
Parsing is the process of syntactic analysis of a string of symbols. It involves resolving a string into its component parts and describing their syntactic roles.

HTMLParser
An HTMLParser instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered.

Example (from the Python 3 documentation):

_Code

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Echo every start tag, end tag and text run the parser encounters."""

    def _report(self, label, payload):
        """Print *label* and *payload* on one line, separated by a space."""
        print(label, payload)

    def handle_starttag(self, tag, attrs):
        """Announce an opening tag; its attributes are not printed."""
        self._report("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Announce a closing tag."""
        self._report("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Announce a run of text found between tags."""
        self._report("Encountered some data  :", data)

# The two adjacent literals are concatenated at compile time, so the parser
# receives the whole document in a single feed() call.
markup = ('<html><head><title>Test</title></head>'
          '<body><h1>Parse me!</h1></body></html>')
parser = MyHTMLParser()
parser.feed(markup)

_Output

Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data  : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data  : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html

.handle_starttag(tag, attrs)

This method is called to handle the start tag of an element. (For example: <div class='marks'>)
The tag argument is the name of the tag converted to lowercase.
The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets.

.handle_endtag(tag)

This method is called to handle the end tag of an element. (For example: </div>)
The tag argument is the name of the tag converted to lowercase.

.handle_startendtag(tag,attrs)

This method is called to handle the empty tag of an element. (For example: <br />)
The tag argument is the name of the tag converted to lowercase.
The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets.

HackerRank HTML Parser – Part 1 solution in Python 2.

from HTMLParser import HTMLParser

# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Print every start, end and empty tag together with its attributes.

    Each ``print`` receives exactly one pre-formatted string. That yields the
    same output as the Python 2 ``print a, b`` statement form while also being
    valid syntax on Python 3.
    """

    def _print_attrs(self, attrs):
        """Print one ``-> name > value`` line per (name, value) pair.

        A valueless attribute arrives with ``None`` as its value and is
        printed as the text ``None``.
        """
        for name, value in attrs:
            print("-> %s > %s" % (name, value))

    def handle_starttag(self, tag, attrs):
        """Report an opening tag followed by its attributes."""
        print("Start : %s" % (tag,))
        self._print_attrs(attrs)

    def handle_endtag(self, tag):
        """Report a closing tag."""
        print("End   : %s" % (tag,))

    def handle_startendtag(self, tag, attrs):
        """Report a self-closing (empty) tag followed by its attributes."""
        print("Empty : %s" % (tag,))
        self._print_attrs(attrs)

# Read the line count as plain text: Python 2's input() eval()s whatever
# arrives on stdin, which is unsafe and unnecessary for an integer.
N = int(raw_input())
parser = MyHTMLParser()

for _ in xrange(N):
    # raw_input() drops the line terminator; put it back so a tag that is
    # split across lines keeps the whitespace between its tokens.
    parser.feed(raw_input() + "\n")

HTML Parser – Part 1 solution in Python 3.

# Enter your code here. Read input from STDIN. Print output to STDOUT
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print each start, end and empty tag together with its attributes."""

    def _print_attributes(self, attrs):
        """Print one ``-> name > value`` line per (name, value) pair."""
        for name, value in attrs:
            print('->', name, '>', value)

    def handle_starttag(self, tag, attrs):
        """Report an opening tag, then its attributes."""
        print('Start :', tag)
        self._print_attributes(attrs)

    def handle_endtag(self, tag):
        """Report a closing tag."""
        print('End   :', tag)

    def handle_startendtag(self, tag, attrs):
        """Report a self-closing (empty) tag, then its attributes."""
        print('Empty :', tag)
        self._print_attributes(attrs)
            
MyParser = MyHTMLParser()
# Rebuild the document with its line breaks intact. Stripping each line and
# joining with '' would fuse tokens that were separated only by a newline
# (for example "<a" followed by 'href="x">' would parse as one tag name).
line_count = int(input())
MyParser.feed('\n'.join(input() for _ in range(line_count)))

Problem solution in PyPy (Python 2).

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Print start, end and empty tags with their attributes.

    Any tag literally named ``comment`` is skipped without output.
    """

    def _emit(self, template, tag, attrs):
        """Print the tag line built from *template*, then its attribute lines.

        Nothing is printed when *tag* is ``'comment'``. *attrs* may be empty
        or ``None``, in which case only the tag line is printed.
        """
        if tag == 'comment':
            return
        print(template.format(tag))
        for name, value in (attrs or ()):
            print('-> {0} > {1}'.format(name, value))

    def handle_starttag(self, tag, attrs):
        """Report an opening tag and its attributes."""
        self._emit('Start : {0}', tag, attrs)

    def handle_endtag(self, tag):
        """Report a closing tag."""
        self._emit('End   : {0}', tag, None)

    def handle_startendtag(self, tag, attrs):
        """Report a self-closing (empty) tag and its attributes."""
        self._emit('Empty : {0}', tag, attrs)
        
h = MyHTMLParser()
# Parse the count explicitly: Python 2's input() eval()s the text it reads.
line_count = int(raw_input())
# Join once instead of growing the string with += on every iteration, and keep
# the line breaks so tokens on adjacent lines are not fused together.
single_html_str = '\n'.join(raw_input() for _ in range(line_count))
h.feed(single_html_str)

Problem solution in PyPy3 (Python 3).

# Enter your code here. Read input from STDIN. Print output to STDOUT
import re
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Print each start, end and empty tag together with its attributes."""

    def handle_starttag(self, tag, attrs):
        """Report an opening tag, then its attributes."""
        print("Start :", tag)
        self.value(attrs)

    def handle_endtag(self, tag):
        """Report a closing tag."""
        print("End   :", tag)

    def handle_startendtag(self, tag, attrs):
        """Report a self-closing (empty) tag, then its attributes."""
        print("Empty :", tag)
        self.value(attrs)

    def value(self, attrs=None):
        """Print one ``-> name > value`` line per (name, value) pair.

        Prints nothing when *attrs* is empty or ``None``.
        """
        # A plain loop: printing is a side effect, so there is no reason to
        # build a throwaway list of None results with a comprehension.
        for name, val in attrs or ():
            print('->', name, '>', val)

# Re-join the input lines with real newlines. Joining with the letter 'n'
# would insert a stray character at every line boundary and corrupt the markup.
line_count = int(input())
ss = '\n'.join(input() for _ in range(line_count))
parser = MyHTMLParser()
parser.feed(ss)