简单介绍

百度翻译官网地址为:https://fanyi.baidu.com/
它支持全球28种热门语言互译功能,在以下分析过程中仅仅以中英互译进行,对于其他语种,只需要修改参数即可实现。

翻译接口分析

翻译接口定位

打开 https://fanyi.baidu.com/ 链接输入英文 friend 进行翻译 , 通过浏览器F12控制台进行数据抓取:

整个翻译过程主要做了两件事:
1、调用https://fanyi.baidu.com/langdetect接口检测要翻译的语言,返回结果为:{"error":0,"msg":"success","lan":"en"},表示输入语言为英文。
2、调用https://fanyi.baidu.com/v2transapi接口进行翻译,接口参数中的from即第一步检测得到的语言类型(如en)。

返回结果为json字符串,这里因字符数限制不便展开,文末将以附件的形式提供。

翻译接口参数分析

https://fanyi.baidu.com/v2transapi?from=en&to=zh&query=friend&transtype=translang&simple_means_flag=3&sign=354521.101352&token=8f0ab24debcd7841d11da01d58ea2148

以上翻译接口请求的关键参数为:
from:翻译源语言
to:翻译后的语言
query:要翻译的内容
sign:签名
token:token
所以要实现翻译则需要生成该接口对应的参数,以下将分析如何生成接口参数。

要弄清以上参数是如何生成的,首先需要找到是哪个js文件发起了https://fanyi.baidu.com/v2transapi请求。通过浏览器开发者工具的Sources面板查找,最终定位到该ajax请求位于index_c8a141d.js中,如图:

获取js代码,并格式化后信息如下:

以上可以看出token参数位于:window.common.token,这个在https://fanyi.baidu.com/translate网页html源码中即可查到。

而关键参数sign签名则需要继续分析m函数,m函数对应于 m = t("translation:widget/translate/input/pGrab"),该模块的源码如下:

// Module factory excerpted from Baidu Translate's front-end bundle. It defines
// three helpers (a, n, e) and exports `e`, which turns the text to translate
// into a string of the form "<p>.<p ^ m>", where m and the second seed s come
// from the dot-separated value stored in window.gtk.
function(r, o, t) {
    "use strict";
    // Return a new array with the elements of `r`: real arrays are copied index
    // by index, anything else is converted with Array.from.
    function a(r) {
        if (Array.isArray(r)) {
            for (var o = 0,
            t = Array(r.length); o < r.length; o++) t[o] = r[o];
            return t
        }
        return Array.from(r)
    }
    // Apply the mixing program `o` to the number `r`. `o` is read three
    // characters at a time:
    //   char 0: "+" adds the shifted value (result kept to 32 bits), anything else XORs it;
    //   char 1: "+" shifts right unsigned (>>>), anything else shifts left (<<);
    //   char 2: shift amount, a digit 0-9 or a letter where "a" means 10, "b" 11, ...
    function n(r, o) {
        for (var t = 0; t < o.length - 2; t += 3) {
            var a = o.charAt(t + 2);
            a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
            a = "+" === o.charAt(t + 1) ? r >>> a: r << a,
            r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
        }
        return r
    }
    // Exported entry point: compute the "<p>.<p ^ m>" string for the text `r`.
    function e(r) {
        // `o` holds every surrogate pair found in the text, or null if there is none.
        var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
        if (null === o) {
            // No surrogate pairs: texts longer than 30 UTF-16 units are reduced to
            // their first 10, middle 10 and last 10 units.
            var t = r.length;
            t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr( - 10, 10))
        } else {
            // Surrogate pairs present: rebuild the text as a list `f` in which each
            // pair is a single entry, then apply the same 10 + 10 + 10 reduction
            // counted in list entries.
            for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
            var g = f.length;
            g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice( - 10).join(""))
        }
        // `l` spells "gtk" from character codes 103, 116, 107. The seed is read from
        // window.gtk once and cached in the closure variable `i`.
        var u = void 0,
        l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
        u = null !== i ? i: (i = window[l] || "") || "";
        // Split the seed "<m>.<s>" into its two numbers (0 when missing or not
        // numeric), then encode the text into the byte array `S` as UTF-8: 1 byte
        // below 128, 2 bytes below 2048, 4 bytes for a valid surrogate pair,
        // otherwise 3 bytes.
        for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
            var A = r.charCodeAt(v);
            128 > A ? S[c++] = A: (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)), S[c++] = A >> 18 | 240, S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224, S[c++] = A >> 6 & 63 | 128), S[c++] = 63 & A | 128)
        }
        // F spells "+-a^+6" and D spells "+-3^+b+-f" from character codes. Starting
        // from p = m, every byte is added to p and mixed with program F.
        for (var p = m,
        F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++) p += S[b],
        p = n(p, F);
        // Final steps: mix once with program D, XOR with s, map a negative result to
        // its unsigned 32-bit value, keep the last six decimal digits, and return
        // "<p>.<p ^ m>".
        return p = n(p, D),
        p ^= s,
        0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
        p.toString() + "." + (p ^ m)
    }
    // Cache for the window.gtk value; filled on the first call to e.
    var i = null;
    t.exports = e
});;

对于以上函数,最终调用的是e函数,分析e函数,其中关键代码如下:

// Excerpt from e(): `l` is built from character codes 103, 116, 107 ("gtk"), so
// `u` ends up holding window["gtk"] (cached in `i` after the first read) or "".
var u = void 0,
l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i: (i = window[l] || "") || "";

对照ASCII表可得 l = "" + "g" + "t" + "k",即 l = "gtk",所以 u = window["gtk"];该值可以通过console查看:

同时也可以通过网页源码查看获取:

<script>window.bdstoken = '';window.gtk = '320305.131321201';</script>

至此,整个百度翻译过程已经完整分析清楚了。

爬虫脚本实现

1、sign参数获取

因为sign参数获取需要调用js代码生成,所以需要拷贝改造百度js脚本,最终修改完baidufanyi.js脚本源码为:

/**
 * Return the elements of `r` as a new array.
 *
 * Real arrays are copied index by index into a pre-sized array; any other
 * iterable or array-like value is converted with Array.from.
 *
 * @param {Array|Iterable|ArrayLike} r - Value to copy or convert.
 * @returns {Array} A new array holding the elements of `r`.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = Array(r.length);
    var idx = 0;
    while (idx < r.length) {
        copy[idx] = r[idx];
        idx += 1;
    }
    return copy;
}
/**
 * Run the mixing program `o` over the number `r`.
 *
 * The program is consumed three characters at a time:
 *   - char 0: "+" adds the shifted value and keeps the sum to 32 bits,
 *             any other character XORs it in;
 *   - char 1: "+" shifts right unsigned (>>>), any other character shifts left;
 *   - char 2: the shift amount, a digit or a letter where "a" stands for 10.
 * Trailing characters that do not form a full triple are ignored.
 *
 * @param {number} r - Starting value.
 * @param {string} o - Mixing program, e.g. "+-a^+6".
 * @returns {number} The mixed value.
 */
function n(r, o) {
    var acc = r;
    var pos = 0;
    while (pos < o.length - 2) {
        var combineOp = o.charAt(pos);
        var shiftOp = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if (shiftOp === "+") {
            shifted = acc >>> amount;
        } else {
            shifted = acc << amount;
        }

        if (combineOp === "+") {
            acc = acc + shifted & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
        pos += 3;
    }
    return acc;
}
/**
 * Compute the `sign` request parameter for Baidu Translate's v2transapi.
 *
 * Adapted (@happiren) from Baidu's front-end module: the seed that the original
 * read from `window.gtk` is passed in as the second argument instead, so the
 * function can run outside a browser. Relies on the sibling helpers `a` and `n`.
 *
 * @param {string} r - Text to be translated.
 * @param {string} u - Value of `window.gtk`, formatted "<m>.<s>"
 *     (e.g. "320305.131321201"). A missing value is treated as "" so both
 *     numbers default to 0, matching the original `window.gtk || ""` fallback.
 * @returns {string} The signature, formatted "<p>.<p ^ m>".
 */
function e(r, u) {
    var text = r;
    var pairs = text.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);

    if (pairs === null) {
        // No surrogate pairs: texts longer than 30 UTF-16 units are reduced to
        // their first 10, middle 10 and last 10 units.
        var unitCount = text.length;
        if (unitCount > 30) {
            text = "" + text.substr(0, 10) + text.substr(Math.floor(unitCount / 2) - 5, 10) + text.substr(-10, 10);
        }
    } else {
        // Surrogate pairs present: rebuild the text as a list in which each pair
        // is one entry, so the 10 + 10 + 10 reduction never cuts a pair in half.
        var segments = text.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/);
        var chars = [];
        for (var k = 0; k < segments.length; k++) {
            if (segments[k] !== "") {
                chars.push.apply(chars, a(segments[k].split("")));
            }
            if (k !== segments.length - 1) {
                chars.push(pairs[k]);
            }
        }
        var charCount = chars.length;
        if (charCount > 30) {
            var middle = Math.floor(charCount / 2);
            text = chars.slice(0, 10).join("") + chars.slice(middle - 5, middle + 5).join("") + chars.slice(-10).join("");
        }
    }

    // Restore the fallback the browser version had: without it a missing `u`
    // makes `u.split` throw a TypeError.
    var seed = u == null ? "" : String(u);
    var seedParts = seed.split(".");
    var m = Number(seedParts[0]) || 0;
    var s = Number(seedParts[1]) || 0;

    // Encode the (possibly reduced) text as UTF-8 bytes.
    var bytes = [];
    for (var v = 0; v < text.length; v++) {
        var code = text.charCodeAt(v);
        if (code < 128) {
            bytes.push(code);
        } else if (code < 2048) {
            bytes.push(code >> 6 | 192);
            bytes.push(63 & code | 128);
        } else {
            if (55296 === (64512 & code) && v + 1 < text.length && 56320 === (64512 & text.charCodeAt(v + 1))) {
                // Valid surrogate pair: combine into one code point, 4 bytes.
                code = 65536 + ((1023 & code) << 10) + (1023 & text.charCodeAt(++v));
                bytes.push(code >> 18 | 240);
                bytes.push(code >> 12 & 63 | 128);
            } else {
                bytes.push(code >> 12 | 224);
            }
            bytes.push(code >> 6 & 63 | 128);
            bytes.push(63 & code | 128);
        }
    }

    // Starting from m, add each byte and mix it in, then apply the final mix.
    // These two programs are what the original spelled out via fromCharCode.
    var p = m;
    for (var b = 0; b < bytes.length; b++) {
        p += bytes[b];
        p = n(p, "+-a^+6");
    }
    p = n(p, "+-3^+b+-f");
    p ^= s;
    if (p < 0) {
        // Reinterpret the negative 32-bit value as unsigned.
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}
// Cache slot kept from Baidu's original module; nothing reads it now that the
// gtk value is passed to e() as `u`.
var i = null;

2、获取cookie

def get_fanyi_cookie():
    """Request the Baidu Translate home page and return the cookies it sets.

    Returns:
        dict: Cookie names mapped to their values, as produced by
        ``requests.utils.dict_from_cookiejar``.
    """
    url = "https://fanyi.baidu.com"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=10)
    return requests.utils.dict_from_cookiejar(response.cookies)

3、获取token、 gtk参数

def prepare_param(cookieDict):
    """Fetch the translate page and extract the ``token`` and ``window.gtk`` values.

    Args:
        cookieDict (dict): Cookie names mapped to values; sent back in the
            ``cookie`` request header when non-empty.

    Returns:
        tuple: ``(token, windows_gtk)`` as strings taken from the page HTML.

    Raises:
        IndexError: If either value cannot be found in the returned HTML.
    """
    url = "https://fanyi.baidu.com/translate"
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'User-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36",
    }
    if len(cookieDict) > 0:
        headers['cookie'] = "".join(
            "{0}={1};".format(key, cookieDict[key]) for key in cookieDict
        )
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    gtk_matches = re.findall(r";window\.gtk = (.*?);</script>", html)
    token_matches = re.findall(r"token: '(.*?)',", html)
    if not gtk_matches or not token_matches:
        # Same exception type the bare [0] lookup raised, but with a usable message.
        raise IndexError("token or window.gtk not found in the translate page HTML")

    # The captured gtk value still carries its surrounding quotes; strip them.
    windows_gtk = gtk_matches[0][1:-1]
    token = token_matches[0]
    return token, windows_gtk

4、调用翻译接口翻译

def translate(query, fro, to, cookieDict):
    """Translate ``query`` through Baidu's ``v2transapi`` endpoint.

    The ``sign`` parameter is produced by running function ``e`` from
    ``baidufanyi.js`` through execjs; ``token`` and the gtk seed come from
    ``prepare_param``.

    Args:
        query (str): Text to translate.
        fro (str): Source language code, e.g. ``'en'``.
        to (str): Target language code, e.g. ``'zh'``.
        cookieDict (dict): Cookie names mapped to values, sent in the
            ``cookie`` request header when non-empty.

    Returns:
        The raw response body (a JSON document) when the server answers with
        HTTP 200, otherwise ``None``.
    """
    import io  # local import: gives open() an encoding argument on Python 2 and 3

    runtime = execjs.get()
    js_path = "baidufanyi.js"
    # The script contains non-ASCII comments, so read it explicitly as UTF-8 and
    # close the handle once the source has been loaded.
    with io.open(js_path, "r", encoding="utf-8") as js_file:
        ctx = runtime.compile(js_file.read())

    token, gtk = prepare_param(cookieDict)
    # Pass the arguments through execjs rather than formatting them into JS
    # source: a query containing quotes, backslashes or newlines would otherwise
    # break the expression or inject code.
    sign = ctx.call("e", query, gtk)

    # Testing showed the cookie header must include BAIDUID for the API to work.
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36",
    }
    # Attach the cookies.
    if len(cookieDict) > 0:
        headers['cookie'] = "".join(
            "{0}={1};".format(key, cookieDict[key]) for key in cookieDict
        )

    data = {
        'from': fro,
        'to': to,
        "query": query,
        "transtype": 'translang',
        # The browser request names this parameter "simple_means_flag".
        "simple_means_flag": '3',
        "sign": sign,
        'token': token,
    }
    url = "https://fanyi.baidu.com/v2transapi"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, data=data, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content
    return None

以上关键部分代码为通过execjs模块调用js代码生成sign签名:sign = ctx.eval(js);

5、调用函数翻译

# Example run: fetch cookies, translate "friend" from English to Chinese,
# print the raw response and parse it as JSON.
baiduCookies = get_fanyi_cookie()
content = translate("friend", 'en', 'zh', baiduCookies)
print(content)
# Bind the parsed document to its own name so the json module is not overwritten;
# translate() returns None on a non-200 response, which json.loads cannot parse.
result = json.loads(content) if content is not None else None

至此,整个翻译脚本已经实现

PS: github源码