jsonflow
A Crawling Framework Based on Data Flow and Decorators(查看中文版)
Before Use
pip install -r requirements.txt
Get Started
@src: Fetch Resources
- Fetch a single page
from jsonflow.core import jf
@jf.src('https://en.wikipedia.org/wiki/wiki/python')
def download_page():
with open('python.html', 'wb') as f:
f.write(jf.data.encode('utf-8')) # use get_data to fetch data from jsonflow container
if __name__ == '__main__':
download_page()
- Use parameters
from jsonflow.core import jf
# Use the parameter "name" of function "download_page" as the url suffix
@jf.src('https://en.wikipedia.org/wiki/<name>')
def download_page(name):
with open(name + '.html', 'wb') as f:
f.write(jf.data.encode('utf-8'))
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
download_page(name=keyword) # should be keyword arguments here
@flow: The Data Flow
from jsonflow.core import jf
from bs4 import BeautifulSoup
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text)
def get_title(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
title = get_title(name=keyword)
print(title)
'''
C++ - Wikipedia
Python - Wikipedia
Java - Wikipedia
C - Wikipedia
JavaScript - Wikipedia
'''
@flow: High Level Usage
- list
from hashlib import md5
from jsonflow.core import jf
from bs4 import BeautifulSoup
# get md5
def digest(s):
return md5(s.encode()).hexdigest()
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
[len, digest])
# equals to lambda title : [len(title), digest(title)]
# also equals to [lambda title : len(title), lambda title : digest(title)]
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
[15, 'b47f94ac21757616e99c4256320236c3']
...
'''
- dict
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'length': len, 'md5': digest})
# equals to lambda title : {'length': len(title), 'md5': digest(title)}
# also equals to {'length': lambda title : len(title), 'md5': lambda title : digest(title)}
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
{'length': 15, 'md5': 'b47f94ac21757616e99c4256320236c3'}
...
'''
- nesting
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'info': [len, digest]})
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
{'info': [15, 'b47f94ac21757616e99c4256320236c3']}
...
'''
- use parameters
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'<name>': [len, digest]}) # 以参数name作为key
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
{'c++': [15, 'b47f94ac21757616e99c4256320236c3']}
...
'''
Template
The template statement writes between the angle brackets, for example <name>
, which is able to access all parameters of the decorated function and registered external variables, as well as use python expressions directly.
Python Expression
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'<name + "-" + str(len(name))>': [len, digest]})
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
{'c++-3': [15, 'b47f94ac21757616e99c4256320236c3']}
...
'''
Escape
The less-than and greater-than symbol should be escaped in xml/html style
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'<name + "-" + str(len(name) < 3)>': [len, digest]}) # escape
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
{'c++-False': [15, 'b47f94ac21757616e99c4256320236c3']}
...
'''
Register External Variables
from jsonflow.core import jf
# Register variables
jf.prefix = 'lang-'
jf.suffix = '-info'
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'<self.prefix + name + self.suffix>': [len, digest]}) # access through "self"
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
data = get_title_length_and_md5(name=keyword)
print(data)
'''
{'lang-c++-info': [15, 'b47f94ac21757616e99c4256320236c3']}
...
'''
Cookie Inheritence
In this section, you can use test_server.py
as a test service in order to learn how to use cookie inheritence.
pip install flask
python test_server.py
- The response cookie of the upper layer @src is inherited by the lower ones as their request cookies
@jf.src(
'http://localhost:8000/login',
method='post',
data = {
'username': 'Liadrinz',
'password': '123456'
}
)
@jf.src('http://localhost:8000/test_data', inherit_cookies=True)
def get_greet():
return jf.data
- Nearby Principle: the @src decorators whose inherit_cookies equals True inherit from the nearest @src whose inherit_cookies equals False
@jf.src(
'http://localhost:8000/login',
method='post',
data = {
'username': 'Liadrinz',
'password': '123456'
}
)
@jf.src('http://localhost:8000/test_data', inherit_cookies=True)
@jf.src('http://localhost:8000/test_data', inherit_cookies=True)
@jf.src(
'http://localhost:8000/login',
method='post',
data = {
'username': 'StevenZ',
'password': '123456'
}
)
@jf.src('http://localhost:8000/test_data', inherit_cookies=True)
@jf.src('http://localhost:8000/test_data', inherit_cookies=True)
def get_greet():
return jf.data
- Login Demo
@jf.src(
'http://localhost:8000/login',
method='post',
data = {
'username': 'Liadrinz',
'password': '123456'
}
)
def login():
return get_data()
@jf.src('http://localhost:8000/test_data', inherit_cookies=True)
def get_greet():
return get_data()
if __name__ == '__main__':
login()
data = get_greet()
print(data)
@thread: Multithreading
from jsonflow.core import jf
config.max_workers = 4 # max number of workers
# register external variables
jflow.prefix = 'lang-'
jflow.suffix = '-info'
@jf.thread(callback=lambda data : print(data)) # access data through callbacks
@jf.src('https://en.wikipedia.org/wiki/<name>')
@jf.flow(
lambda data : BeautifulSoup(data, 'lxml'),
lambda soup : soup.title.text,
{'<self.prefix + name + self.suffix>': [len, digest]}) # access through "self"
def get_title_length_and_md5(name):
return jf.data
if __name__ == '__main__':
for keyword in ['c++', 'python', 'java', 'c#', 'javascript']:
get_title_length_and_md5(name=keyword)
'''
...
{'lang-c++-info': [15, 'b47f94ac21757616e99c4256320236c3']}
...
'''
jf.wait() # wait for the thread pool