简单介绍
百度翻译官网地址为:https://fanyi.baidu.com/
它支持全球28种热门语言互译功能,在以下分析过程中仅仅以中英互译进行,对于其他语种,只需要修改参数即可实现。
翻译接口分析
翻译接口定位
打开 https://fanyi.baidu.com/ 链接输入英文 friend 进行翻译 , 通过浏览器F12控制台进行数据抓取:
整个翻译过程主要做了两件事:
1、调用https://fanyi.baidu.com/langdetect接口检测要翻译的语言,返回结果为:{"error":0,"msg":"success","lan":"en"},表示输入语言为英文。
2、调用https://fanyi.baidu.com/v2transapi接口进行翻译,接口参数中的from是第一步获取到的类型。
返回结果为json字符串,这里因字符数限制不便展开,文末将以附件的形式提供。
翻译接口参数分析
https://fanyi.baidu.com/v2transapi?from=en&to=zh&query=friend&transtype=translang&simple_means_flag=3&sign=354521.101352&token=8f0ab24debcd7841d11da01d58ea2148
以上翻译接口请求的关键参数为:
from:翻译源语言
to:翻译后的语言
query:要翻译的内容
sign:签名
token:token
所以要实现翻译则需要生成该接口对应的参数,以下将分析如何生成接口参数。
要查找到以上信息,我们需要查找到哪个js调用了https://fanyi.baidu.com/v2transapi请求。所以通过sources查找,最终找到ajax请求位于代码index_c8a141d.js如图:
获取js代码,并格式化后信息如下:
以上可以看出token参数位于:window.common.token,这个在https://fanyi.baidu.com/translate网页html源码中即可查到。
而关键参数sign签名则需要继续分析m函数,m函数对应于m = t("translation:widget/translate/input/pGrab")
// Module factory quoted from the site's bundle. It assigns e to t.exports;
// e turns a query string into the "sign" string sent with the translate request.
function(r, o, t) {
"use strict";
// Returns a copy of r as an array: index-by-index for arrays, Array.from otherwise.
function a(r) {
if (Array.isArray(r)) {
for (var o = 0,
t = Array(r.length); o < r.length; o++) t[o] = r[o];
return t
}
return Array.from(r)
}
// Applies the operations encoded in o to r, three characters at a time:
//   char 0: "+" adds the shifted value (truncated via & 4294967295), anything else XORs it;
//   char 1: "+" shifts right unsigned (>>>), anything else shifts left (<<);
//   char 2: shift amount, a digit or a letter from "a" upward (charCode - 87, so "a" is 10).
function n(r, o) {
for (var t = 0; t < o.length - 2; t += 3) {
var a = o.charAt(t + 2);
a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
a = "+" === o.charAt(t + 1) ? r >>> a: r << a,
r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
}
return r
}
// Computes the sign for query text r.
function e(r) {
// o holds every surrogate pair found in r, or null when there is none.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
var t = r.length;
// Longer than 30 code units: keep only the first 10, the middle 10 and the last 10.
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr( - 10, 10))
} else {
// Build f as a list of characters in which each surrogate pair is one entry,
// so the 10/10/10 sampling below never cuts a pair in half.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice( - 10).join(""))
}
// l is the string "gtk" spelled with char codes; u becomes window.gtk (or ""),
// cached in the module-level variable i after the first lookup.
var u = void 0,
l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i: (i = window[l] || "") || "";
// d splits u at "."; m and s are its two numeric halves (0 when missing or non-numeric).
// The loop fills S with the UTF-8 bytes of r: 1 byte below 128, 2 bytes below 2048,
// 4 bytes for a valid surrogate pair, 3 bytes for everything else.
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A: (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)), S[c++] = A >> 18 | 240, S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224, S[c++] = A >> 6 & 63 | 128), S[c++] = 63 & A | 128)
}
// F is the string "+-a^+6" and D is "+-3^+b+-f", both spelled with char codes.
// Starting from m, each byte is added to p and p is run through n with F.
for (var p = m,
F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++) p += S[b],
p = n(p, F);
// Final steps: run n with D, XOR with s, make the value non-negative,
// reduce modulo 1e6, and return "<p>.<p XOR m>".
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
// Cache for the gtk value read inside e; null until the first call.
var i = null;
t.exports = e
});;
对于以上函数,最终调用的是e函数,分析e函数,其中关键代码如下:
// Char codes 103, 116, 107 spell the property name looked up on window;
// the value found is cached in i and reused on later calls.
var u = void 0,
l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i: (i = window[l] || "") || "";
对照ASCII表可得 l = "" + "g" + "t" + "k",所以 u = window["gtk"];该值可以通过console查看:
同时也可以通过网页源码查看获取:
<script>window.bdstoken = '';window.gtk = '320305.131321201';</script>
至此,整个百度翻译过程已经完整分析清楚了。
爬虫脚本实现
1、sign参数获取
因为sign参数获取需要调用js代码生成,所以需要拷贝改造百度js脚本,最终修改完baidufanyi.js脚本源码为:
/**
 * Return a fresh array holding the elements of r.
 *
 * Real arrays are copied slot by slot; any other value is converted
 * with Array.from.
 *
 * @param {*} r - array, string or other iterable/array-like value
 * @returns {Array} a new array with the same elements
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = new Array(r.length);
    var idx = 0;
    while (idx < r.length) {
        copy[idx] = r[idx];
        idx += 1;
    }
    return copy;
}
/**
 * Run the number r through the operations encoded in the string o.
 *
 * o is read three characters at a time:
 *   - 1st char: "+" adds the shifted value and truncates to 32 bits,
 *     any other character XORs it in;
 *   - 2nd char: "+" means unsigned right shift, any other character left shift;
 *   - 3rd char: shift amount, a digit or a letter from "a" upward ("a" is 10).
 * Trailing characters that do not form a full triple are ignored.
 *
 * @param {number} r - starting value
 * @param {string} o - operation string, e.g. "+-a^+6"
 * @returns {number} the transformed value
 */
function n(r, o) {
    var acc = r;
    var pos = 0;
    while (pos < o.length - 2) {
        var amountChar = o.charAt(pos + 2);
        var amount = amountChar >= "a" ? amountChar.charCodeAt(0) - 87 : Number(amountChar);
        var shifted = o.charAt(pos + 1) === "+" ? acc >>> amount : acc << amount;
        if (o.charAt(pos) === "+") {
            acc = (acc + shifted) & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
        pos += 3;
    }
    return acc;
}
// Adapted from the site's script (@happiren): the gtk value is passed in as
// the parameter u instead of being read from window.gtk.

/**
 * Shorten long query text the way the site does before signing it.
 *
 * The text is split into characters with each surrogate pair kept as one
 * unit. If there are more than 30 units, only the first 10, the middle 10
 * and the last 10 are kept; otherwise the text is returned unchanged.
 *
 * @param {string} text - query text
 * @returns {string} the text that is actually signed
 */
function sampleQuery(text) {
    var units = text.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
    if (units.length <= 30) {
        return text;
    }
    var mid = Math.floor(units.length / 2);
    return units.slice(0, 10).join("") +
        units.slice(mid - 5, mid + 5).join("") +
        units.slice(-10).join("");
}

/**
 * Encode text as UTF-8 byte values.
 *
 * Hand-rolled on purpose: an unpaired surrogate is emitted as three bytes
 * rather than being replaced, which the signature depends on.
 *
 * @param {string} text - text to encode
 * @returns {number[]} byte values
 */
function utf8Bytes(text) {
    var bytes = [];
    for (var k = 0; k < text.length; k++) {
        var code = text.charCodeAt(k);
        if (code < 128) {
            bytes.push(code);
            continue;
        }
        if (code < 2048) {
            bytes.push(code >> 6 | 192, 63 & code | 128);
            continue;
        }
        var startsPair = 55296 === (64512 & code) &&
            k + 1 < text.length &&
            56320 === (64512 & text.charCodeAt(k + 1));
        if (startsPair) {
            // Combine the pair into one code point and consume the low surrogate.
            code = 65536 + ((1023 & code) << 10) + (1023 & text.charCodeAt(++k));
            bytes.push(code >> 18 | 240, code >> 12 & 63 | 128);
        } else {
            bytes.push(code >> 12 | 224);
        }
        bytes.push(code >> 6 & 63 | 128, 63 & code | 128);
    }
    return bytes;
}

/**
 * Compute the `sign` request parameter for a piece of query text.
 *
 * Example: e("friend", "320305.131321201") returns "354521.101352".
 *
 * @param {string} r - text to translate
 * @param {string|number} [u] - the page's gtk value in the form "<m>.<s>".
 *     A missing value is treated as "" (both halves become 0), which is the
 *     fallback the upstream script applies when window.gtk is absent.
 * @returns {string} the signature "<p>.<p XOR m>", where p is below 1e6
 */
function e(r, u) {
    var halves = (u == null ? "" : String(u)).split(".");
    var m = Number(halves[0]) || 0;
    var s = Number(halves[1]) || 0;
    var bytes = utf8Bytes(sampleQuery(r));

    // Per-byte mixing, starting from the first gtk half.
    var p = m;
    for (var b = 0; b < bytes.length; b++) {
        p = n(p + bytes[b], "+-a^+6");
    }

    p = n(p, "+-3^+b+-f");
    p ^= s;
    if (p < 0) {
        // Reinterpret the signed 32-bit result as an unsigned value.
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}
2、获取cookie
def get_fanyi_cookie():
    """Request the Baidu Translate home page and return the cookies it sets.

    Returns:
        dict: cookie names mapped to values, converted from the response's
        cookie jar.
    """
    url = "https://fanyi.baidu.com"
    # requests applies no timeout by default, so a stalled connection would
    # block forever; bound the wait instead.
    response = requests.get(url, timeout=10)
    return requests.utils.dict_from_cookiejar(response.cookies)
3、获取token、 gtk参数
def prepare_param(cookieDict):
    """Fetch the translate page and extract the token and gtk values from it.

    Args:
        cookieDict: dict of cookie names to values; when non-empty it is
            sent as the request's cookie header.

    Returns:
        tuple: (token, windows_gtk) as strings taken from the page's HTML.

    Raises:
        IndexError: if either value cannot be found in the returned HTML.
    """
    url = "https://fanyi.baidu.com/translate"
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'User-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36",
    }
    if len(cookieDict) > 0:
        headers['cookie'] = "".join(
            key + "=" + cookieDict[key] + ";" for key in cookieDict)
    # requests applies no timeout by default; bound the wait.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text
    gtk_matches = re.findall(";window.gtk = (.*?);</script>", html)
    token_matches = re.findall(r"token: '(.*?)',", html)
    if not gtk_matches or not token_matches:
        # Same exception type as indexing an empty match list, but with a
        # message that says what was being looked for.
        raise IndexError(
            "could not find window.gtk / token in the page returned by " + url)
    # The captured gtk still carries its surrounding quote characters.
    windows_gtk = gtk_matches[0][1:-1]
    token = token_matches[0]
    return token, windows_gtk
4、调用翻译接口翻译
def translate(query, fro, to, cookieDict):
    """Translate text through the v2transapi endpoint.

    Args:
        query: text to translate.
        fro: source language code, e.g. 'en'.
        to: target language code, e.g. 'zh'.
        cookieDict: dict of cookie names to values sent with the requests.

    Returns:
        The raw response body when the HTTP status is 200, otherwise None.
    """
    node = execjs.get()
    script_path = "baidufanyi.js"
    # Close the script file as soon as it has been read.
    with open(script_path, "r") as script_file:
        jsScript = script_file.read()
    ctx = node.compile(jsScript)
    token, u = prepare_param(cookieDict)
    # Pass the arguments through call() rather than pasting them into JS
    # source: quotes, backslashes or newlines in the query would otherwise
    # break the generated expression or run as code.
    sign = ctx.call("e", query, u)
    # Testing showed the request only works when the cookie header contains BAIDUID.
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36",
    }
    # Attach the cookies.
    if len(cookieDict) > 0:
        headers['cookie'] = "".join(
            key + "=" + cookieDict[key] + ";" for key in cookieDict)
    data = {
        'from': fro,
        'to': to,
        "query": query,
        "transtype": 'translang',
        # Key spelled as in the captured browser request (was "simple_meas_flag").
        "simple_means_flag": '3',
        "sign": sign,
        'token': token
    }
    url = "https://fanyi.baidu.com/v2transapi"
    # requests applies no timeout by default; bound the wait.
    response = requests.post(url, data=data, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content
    return None
以上关键部分代码为通过execjs模块调用js代码生成sign签名:sign = ctx.eval(js);
5、调用函数翻译
# Demo: fetch cookies, translate "friend" from English to Chinese and parse the reply.
baiduCookies = get_fanyi_cookie()
content = translate("friend", 'en', 'zh', baiduCookies)
print(content)
# translate() returns None on a non-200 response, so only parse a real body.
# The parsed result gets its own name: assigning it to `json` would replace
# the json module and break any later json.* call.
result = json.loads(content) if content is not None else None
至此,整个翻译脚本已经实现
PS: github源码