import urllib.request
import urllib.parse
 
post_data = urllib.parse.urlencode({'id': 'XXXXXXX', 'srpla': 'XXXXXX'}).encode('UTF-8')
req = urllib.request.Request("https://www.xxx.net/xxxx.do", post_data)  # data given, so this is a POST
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)")
req.add_header("Cookie", "webid=x; AGEN=x-tkS8rMRo; SLEVEL=1; TIARA=x-x-5u4SHVdUte-x; webid_sync=x")
 
resp = urllib.request.urlopen(req).read().decode('UTF-8')
 
print (resp)
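The same request can be written more defensively with a context manager and explicit error handling. A minimal sketch, keeping the placeholder URL, form fields, and cookie values from above as-is:

import urllib.request
import urllib.parse
import urllib.error
 
post_data = urllib.parse.urlencode({'id': 'XXXXXXX', 'srpla': 'XXXXXX'}).encode('UTF-8')
req = urllib.request.Request("https://www.xxx.net/xxxx.do", post_data)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)")
 
try:
    # The with block closes the connection even if reading fails midway.
    with urllib.request.urlopen(req) as resp:
        print(resp.status)                  # HTTP status code
        print(resp.read().decode('UTF-8'))
except urllib.error.HTTPError as e:
    print("HTTP error:", e.code)
except urllib.error.URLError as e:
    print("Connection failed:", e.reason)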



EX#1) Script - Fetching a page's link list

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
 
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, "html.parser")
for link in bsObj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])


Ex#1) Result - Fetching a page's link list

/wiki/Tomas_Milian
/wiki/D._W._Moffett
/wiki/Dennis_Quaid
/wiki/Peter_Riegert
/wiki/Jacob_Vargas
/wiki/Catherine_Zeta-Jones
/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Cast_in_a_Motion_Picture
/wiki/Template:ScreenActorsGuildAward_CastMotionPicture_1995%E2%80%932000
/wiki/Template:ScreenActorsGuildAward_CastMotionPicture_2001%E2%80%932010
/wiki/Template:ScreenActorsGuildAward_CastMotionPicture_2011%E2%80%932020
/wiki/Help:Authority_control
https://www.worldcat.org/identities/containsVIAFID/39570812
/wiki/Virtual_International_Authority_File
https://viaf.org/viaf/39570812
/wiki/Library_of_Congress_Control_Number
http://id.loc.gov/authorities/names/n88034930
/wiki/International_Standard_Name_Identifier
...
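The list above mixes article links with Template:, Help:, and other namespace pages. Since every namespace page has a colon in its path, a negative-lookahead regex can keep article links only; a short sketch:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
 
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, "html.parser")
# ((?!:).)*$ rejects any href containing a colon (Template:, Help:, ...).
for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
    print(link.attrs['href'])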



Ex#2) Script - Data collection

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
 
pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if link.attrs['href'] not in pages:
            newPage=link.attrs['href']
            print(newPage)
            pages.add(newPage)
            getLinks(newPage)
getLinks("")
 


Ex#2) Result - Data collection

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:Requesting_copyright_permission
/wiki/Wikipedia:User_access_levels
/wiki/Wikipedia:Requests_for_adminship
...
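Note that getLinks() above recurses with no bound, so on a site the size of Wikipedia it will eventually exhaust Python's recursion limit. A minimal sketch of the same crawler with a hypothetical maxDepth cutoff added:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
 
pages = set()
 
def getLinks(pageUrl, depth=0, maxDepth=2):
    # maxDepth is a bound added for this sketch; the original recurses freely.
    if depth > maxDepth:
        return
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            print(newPage)
            pages.add(newPage)
            getLinks(newPage, depth + 1, maxDepth)
 
getLinks("")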






Script

from bs4 import BeautifulSoup
import re
from urllib.request import urlopen
 
 
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, "html.parser")
 
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
 


Result

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg
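Once the src attributes are collected, the image files themselves can be saved. A sketch (the "downloaded" directory name is an assumption) that uses urljoin to resolve the relative ../img/... paths against the page URL before fetching each file:

import os
import re
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
 
baseUrl = "http://www.pythonscraping.com/pages/page3.html"
html = urlopen(baseUrl)
bsObj = BeautifulSoup(html, "html.parser")
 
os.makedirs("downloaded", exist_ok=True)  # hypothetical output directory
for image in bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")}):
    src = image["src"]
    # urljoin turns "../img/gifts/img1.jpg" into an absolute URL.
    absolute = urljoin(baseUrl, src)
    urlretrieve(absolute, os.path.join("downloaded", os.path.basename(src)))
    print("saved", absolute)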



EX#1) Script 

# Tuple unpacking
lax_coordinates = (33.9425, -119.408056)
city, year, pop, chg, area = ('Tokyo', 2003, 32450, 0.66, 8014)
traveler_ids = [('USA','31195855'), ('BRA','CE342567'), ('ESP','XDA205656')]
 
for passport in sorted(traveler_ids):
    print('%s/%s' % passport )
 
for country, _ in traveler_ids:
    print(country)
 
latitude, longitude = lax_coordinates
print (latitude)
print (longitude)
 


EX#1) Result

BRA/CE342567
ESP/XDA205656
USA/31195855
USA
BRA
ESP
33.9425
-119.408056
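Python 3 also supports star unpacking, where one starred name collects whatever the others do not consume. A short sketch with made-up values:

# *rest grabs the variable-length remainder of the tuple.
first, *rest = (33.9425, -119.408056, 'LAX', 'Los Angeles')
print(first)   # 33.9425
print(rest)    # [-119.408056, 'LAX', 'Los Angeles']
 
# The same idea works for function parameters.
def show(country, *codes):
    print(country, codes)
 
show('USA', '31195855', 'XDA205656')  # USA ('31195855', 'XDA205656')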


EX#2) Script

import os
_, filename = os.path.split('C:/Users/hi080/Desktop/dbeaver-ce-4.2.3-win32.win32.x86_64/dbeaver')
print (filename)
 
filepath, _ = os.path.split('C:/Users/hi080/Desktop/dbeaver-ce-4.2.3-win32.win32.x86_64/dbeaver')
print (filepath)


EX#2) Result

dbeaver
C:/Users/hi080/Desktop/dbeaver-ce-4.2.3-win32.win32.x86_64
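The same split can also be expressed with pathlib, the object-oriented path API in the standard library since Python 3.4. A sketch on the same path (PureWindowsPath is used so the example behaves the same on any OS):

from pathlib import PureWindowsPath
 
p = PureWindowsPath('C:/Users/hi080/Desktop/dbeaver-ce-4.2.3-win32.win32.x86_64/dbeaver')
print(p.name)    # dbeaver
print(p.parent)  # C:\Users\hi080\Desktop\dbeaver-ce-4.2.3-win32.win32.x86_64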
 



Script

# Building the list with an ordinary for loop
symbols = 'sqladmin'
codes = []
for symbol in symbols:
    codes.append(ord(symbol))
 
print (codes)

Result

C:\Users\hi080\Anaconda3\python.exe C:/Users/hi080/Desktop/Programming/Python/test.py
[115, 113, 108, 97, 100, 109, 105, 110]



Script

# List comprehension
symbols = 'sqladmin'
codes = [ord(symbol) for symbol in symbols]
codes2 = [chr(code) for code in codes]
 
print (codes)
print (codes2)

Result

C:\Users\hi080\Anaconda3\python.exe C:/Users/hi080/Desktop/Programming/Python/test.py
[115, 113, 108, 97, 100, 109, 105, 110]
['s', 'q', 'l', 'a', 'd', 'm', 'i', 'n']
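When the whole list is not needed at once, a generator expression does the same work lazily, yielding one value at a time instead of materializing the list. A short sketch on the same string:

symbols = 'sqladmin'
 
# Parentheses instead of brackets make this a generator, not a list.
codes = (ord(symbol) for symbol in symbols)
print(next(codes))   # 115
print(list(codes))   # [113, 108, 97, 100, 109, 105, 110]
 
# Generator expressions also feed any function that consumes an iterable.
print(sum(ord(symbol) for symbol in symbols))  # 857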



from bs4 import BeautifulSoup
import urllib.request
html = urllib.request.urlopen("http://www.pythonscraping.com/pages/warandpeace.html")

bsObj = BeautifulSoup(html.read(), "html.parser")
nameList = bsObj.findAll("span", {"class": {"red", "green"}})
for name in nameList:
    print(name.get_text())

Result

C:\Python27\python.exe C:/Users/dj/Desktop/web_py/web.py

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna

Process finished with exit code 0
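The same class filter can also be written as a CSS selector via BeautifulSoup's select(), which accepts standard CSS syntax; a minimal sketch:

from bs4 import BeautifulSoup
import urllib.request
 
html = urllib.request.urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
 
# "span.red, span.green" matches <span> tags carrying either class.
for name in bsObj.select("span.red, span.green"):
    print(name.get_text())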


from bs4 import BeautifulSoup
import urllib.request
html = urllib.request.urlopen("http://www.pythonscraping.com/pages/warandpeace.html")

bsObj = BeautifulSoup(html.read(), "html.parser")
nameList = bsObj.findAll(text="the prince")
print(len(nameList))

Result

C:\Python27\python.exe C:/Users/dj/Desktop/web_py/web.py
7
Process finished with exit code 0
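In newer BeautifulSoup releases the text= keyword is a deprecated alias for string=. A sketch of the same count with the current spelling, plus a regex variant that matches the phrase anywhere inside a text node rather than only as the node's entire content:

from bs4 import BeautifulSoup
import urllib.request
import re
 
html = urllib.request.urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
 
# string= is the current name for the old text= keyword.
print(len(bsObj.find_all(string="the prince")))             # 7
# The regex form also counts partial matches inside longer text nodes.
print(len(bsObj.find_all(string=re.compile("the prince"))))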
