# -*- coding: iso-8859-2 -*- import sys reload(sys) sys.setdefaultencoding("iso-8859-2") #^^^ varázslás from lxml import etree #doc: http://codespeak.net/lxml/ def c1(h): #/a/b/c minden a/b uton elerheto c print h.xpath("/html/body/p")[1].text #print html.xpath("html/body/p").__class__ #-> list def inner_html(xpath): return etree.tostring(xpath, pretty_print=True, method="html", encoding='iso-8859-2') def c1_1(h): print inner_html(h.xpath('/html/body/p')[1]) def c2(h): for para in h.xpath("/html/body/p"): print "Uj bekezdes:" print para.text def c3(h): # a//p : a-val kezdodo akarmilyen utvonal utani p-k for para in h.xpath("//p"): print "Uj bekezdes:" print inner_html(para) def c4(h): # p/* : mindenki, aki p alatt van (kozvetlenul) for s in h.xpath("/html/body/*"): print "Uj cucc:" print inner_html(s) def c4_1(h): for s in h.xpath("/html/body/ul/*"): print "Új cucc:" print inner_html(s) def c4_2(h): # 2 lepesben elerheto ul-ek for s in h.xpath("/*/*/ul"): print "Uj cucc:" print inner_html(s) def c1_2(h): print inner_html(h.xpath("/html/body/p")[-1]) # mint c1_2, csak XPath-al (de akkor xpath listat ad vissza, azert kell a [0] (majd a kovetkezoben is)) def c1_3(h): print inner_html(h.xpath("/html/body/p[last()]")[0]) # mint c1, csak XPath-al (es az 1-tol, nem 0-tol szamol) def c1_4(h): print h.xpath("/html/body/p[1]")[0].text def c5(h): # [@attr=value] for s in h.xpath("//a[@href='http://math.bme.hu']"): print "Uj link:" print inner_html(s) def c5_0(h): for s in h.xpath("//hr"): print "Uj hr:" print s.get('size') def c5_1(h): # [@attr] attr előfordul for s in h.xpath("//hr[@noshade]"): print "Uj hr:" print s.get('size') def c5_2(h): # [not(@attr)] attr nem fordul elő for s in h.xpath("//hr[not(@noshade)]"): print "Uj hr:" print s.get('size') parser = etree.HTMLParser() html = etree.parse("./test.html", parser) """ Megj.: ha nem file-ban, hanem egy stringben van a parse-olando html, akkor az elozo sor helyett html = etree.parse(StringIO.StringIO(string),parser) de ehhez elobb kell: import StringIO """ print "======================== c1 ========================" c1(html) print "======================== c1_1 ========================" c1_1(html) print "======================== c2 ========================" c2(html) print "======================== c3 ========================" c3(html) print "======================== c4 ========================" c4(html) print "======================== c4_1 ========================" c4_1(html) print "======================== c4_2 ========================" c4_2(html) print "======================== c1_2 ========================" c1_2(html) print "======================== c1_3 ========================" c1_3(html) print "======================== c1_4 ========================" c1_4(html) print "======================== c5 ========================" c5(html) print "======================== c5_0 ========================" c5_0(html) print "======================== c5_1 ========================" c5_1(html) print "======================== c5_2 ========================" c5_2(html)