Nginx如何设置才能精准识别并拦截无用的爬虫，只保留Googlebot（谷歌）、Bingbot（必应）、Baiduspider（百度）等少数几个真实有用的爬虫？

当前位置：点晴教程→知识管理交流 →『技术文档交流』

admin

2025年9月25日 10:14 本文热度 704

可以通过Nginx的$http_user_agent变量结合IP验证来实现精准识别。以下是完整的配置方案：

1. 基础User-Agent过滤配置

nginx

# Whitelist of crawler User-Agents.
# `map` is only valid in the http context (not inside a server block), so this
# block must sit directly under http { }.
# "~*" marks a case-insensitive regex. Regex entries are tested in order of
# appearance and the first match wins, so one generic pattern per crawler
# family is enough.
map $http_user_agent $allowed_bot {
    default 0;

    # Google: "googlebot" also matches Googlebot-News / -Image / -Video.
    "~*(googlebot|Mediapartners-Google|AdsBot-Google)" 1;

    # Baidu: "baiduspider" also matches Baiduspider-image / -news / -video.
    "~*baiduspider" 1;

    # Bing
    "~*(bingbot|adidxbot)" 1;

    # Other crawlers that should be allowed
    "~*(Twitterbot|facebookexternalhit|Slackbot)" 1;
}
# Decide in a map whether a request must be dropped. nginx rejects an `if`
# nested inside another `if`, so both conditions are folded into one lookup.
# The key is "<allowed_bot>:<User-Agent>": a request is blocked when it is not
# whitelisted (leading 0) and its User-Agent contains a crawler keyword.
# `map` is only valid in the http context, so it sits next to the server block.
map "$allowed_bot:$http_user_agent" $blocked_bot {
    default 0;
    "~*^0:.*(bot|spider|crawler|scraper)" 1;
}

# Block crawlers that are not on the whitelist
server {
    listen 80;
    server_name yourdomain.com;

    # Self-declared crawler that is not whitelisted
    if ($blocked_bot) {
        return 444;  # nginx-specific code: close the connection without a response
        # Alternative: return 403 "Access Denied";
    }

    # Other regular configuration...
    location / {
        # your regular configuration
    }
}

2. 增强版：IP验证 + User-Agent双重验证

nginx

# Official crawler IP ranges (must be refreshed regularly).
# `geo` sets $valid_bot_ip to 1 when the client address falls inside one of
# the listed networks, and to 0 otherwise. It is only valid in the http context.
geo $valid_bot_ip {
    default 0;

    # Google ranges (examples, refresh regularly)
    66.249.64.0/19 1;
    64.233.160.0/19 1;
    66.102.0.0/20 1;
    74.125.0.0/16 1;

    # Baidu ranges
    180.76.0.0/16 1;
    # The original entries were 119.63.196.0/16 and 123.125.71.0/16. With host
    # bits set nginx warns "low address bits ... are meaningless" and uses the
    # whole /16 anyway, so they are written here as the networks nginx applies.
    # NOTE(review): a narrower prefix (e.g. /24) was probably intended - confirm
    # against Baidu's published ranges before relying on these.
    119.63.0.0/16 1;
    123.125.0.0/16 1;

    # Bing ranges
    157.55.0.0/16 1;
    207.46.0.0/16 1;
    65.52.0.0/14 1;
}
# Enhanced crawler validation: User-Agent AND source IP must both match.
# The key is "<User-Agent>:<valid_bot_ip>". A real User-Agent has more text
# after the bot name (e.g. "...Googlebot/2.1; +http://...)"), so the pattern
# must allow anything between the name and the trailing ":1".
# Note: this only checks that the IP is in some whitelisted range, not that
# the range belongs to the same search engine as the User-Agent.
map "$http_user_agent:$valid_bot_ip" $is_valid_bot {
    default 0;
    "~*googlebot.*:1$" 1;
    "~*baiduspider.*:1$" 1;
    "~*bingbot.*:1$" 1;
    # Add further verified crawlers here...
}

# 1 when the User-Agent claims to be a crawler but failed validation.
# nginx does not allow nested `if`, so the combined condition lives in a map.
map "$is_valid_bot:$http_user_agent" $suspicious_bot {
    default 0;
    "~*^0:.*(bot|spider|crawler|scraper)" 1;
}

# 1 when a suspicious crawler is also on the known-bad list.
map "$suspicious_bot:$http_user_agent" $blocked_bot {
    default 0;
    "~*^1:.*(ahrefs|semrush|mj12bot)" 1;
}

# Rate-limit key: empty for ordinary traffic (requests with an empty key are
# not accounted by limit_req), client address for suspicious crawlers.
map $suspicious_bot $bot_limit_key {
    default "";
    1 $binary_remote_addr;
}

# Crawler rate-limit zone (http context)
limit_req_zone $bot_limit_key zone=bot_limit:10m rate=1r/s;

server {
    listen 80;
    server_name yourdomain.com;

    # Log suspicious crawlers to a dedicated file. `access_log` is not allowed
    # inside a server-level `if`; the `if=` parameter does the filtering.
    # Declaring access_log here replaces any log inherited from the http
    # level, so the regular log is declared explicitly as well (adjust path).
    access_log /var/log/nginx/access.log;
    access_log /var/log/nginx/bot_access.log combined if=$suspicious_bot;

    # Known bad crawlers: drop the connection immediately
    if ($blocked_bot) {
        return 444;
    }

    # Other suspicious crawlers: throttle. `limit_req` is not allowed inside
    # `if`; the keyed zone above restricts it to suspicious requests only.
    limit_req zone=bot_limit burst=5 nodelay;

    # Alternative to throttling: answer with a reduced page instead. `return`
    # runs in the rewrite phase, before limit_req, so the two cannot both
    # apply to the same request.
    # if ($suspicious_bot) {
    #     return 200 "正常页面内容";
    # }

    location / {
        # regular request handling
    }
}

3. 针对特定路径的爬虫控制

nginx

# Serve robots.txt inline, without a file on disk.
# Note: `if`/`return` rules placed directly in the server block are evaluated
# before a location is selected, so they still apply to this URI.
location = /robots.txt {
    # `return` takes its Content-Type from default_type. `add_header
    # Content-Type` would append a second Content-Type header instead of
    # replacing the default one.
    default_type text/plain;
    return 200 "User-agent: *\nDisallow: /admin/\nDisallow: /private/\nSitemap: https://yourdomain.com/sitemap.xml";
}
# Restrict .xml / .txt files (sitemaps) to validated crawlers.
# nginx does not allow an `if` nested inside another `if`, so each condition
# appends a letter to a flag and the combined value is tested once. Only
# rewrite-module directives (set / return) are used inside `if`, which is the
# safe subset in a location.
# $is_valid_bot is expected to be defined by a map at the http level.
location ~* \.(xml|txt)$ {
    set $deny_bot "";

    # U: the User-Agent claims to be a crawler
    if ($http_user_agent ~* (bot|spider|crawler)) {
        set $deny_bot "U";
    }

    # I: the request did not pass crawler validation
    if ($is_valid_bot = 0) {
        set $deny_bot "${deny_bot}I";
    }

    # Self-declared crawler that failed validation
    if ($deny_bot = "UI") {
        return 403;
    }

    # regular handling of sitemap files
}

4. 完整的配置示例

nginx

http {
    # Crawler IP whitelist (must be refreshed regularly).
    # NOTE: if the main configuration also contains
    # `include /etc/nginx/conf.d/*.conf;`, this file would additionally be
    # loaded at the http level, where its entries are not valid directives.
    # In that case keep the list at a path the wildcard does not match.
    geo $bot_ip_whitelist {
        default 0;
        include /etc/nginx/conf.d/bot-ips.conf;  # external IP list file
    }

    # Whitelisted crawler User-Agents
    map $http_user_agent $allowed_bot_ua {
        default 0;
        "~*(googlebot|baiduspider|bingbot|twitterbot|facebookexternalhit)" 1;
    }

    # Known bad crawler signatures
    map $http_user_agent $bad_bot {
        default 0;
        "~*(ahrefs|semrush|mj12bot|dotbot|petalbot)" 1;
    }

    # 1 when the User-Agent claims to be a crawler at all
    map $http_user_agent $claims_bot {
        default 0;
        "~*(bot|spider|crawler)" 1;
    }

    # Combined decision. nginx allows neither nested `if` nor `access_log` /
    # `limit_req` inside a server-level `if`, so the logic lives in a map.
    # Key: "<claims_bot><allowed_bot_ua><bot_ip_whitelist>".
    # "100" = claims to be a crawler, User-Agent not whitelisted, IP not
    # whitelisted. This keeps the lenient rule of the original example.
    # Strict mode (a whitelisted User-Agent must also come from a whitelisted
    # IP, which defeats User-Agent spoofing): additionally map "110" to 1.
    map "$claims_bot$allowed_bot_ua$bot_ip_whitelist" $suspicious_bot {
        default 0;
        "100" 1;
    }

    # Rate-limit key: empty for ordinary traffic (requests with an empty key
    # are not accounted), client address for suspicious crawlers.
    map $suspicious_bot $bot_limit_key {
        default "";
        1 $binary_remote_addr;
    }

    # Crawler rate-limit zone
    limit_req_zone $bot_limit_key zone=bot_limit:10m rate=1r/m;

    server {
        listen 80;
        server_name example.com;

        # Suspicious crawlers are additionally logged to their own file.
        # Declaring access_log here replaces any log inherited from the http
        # level, so the regular log is declared explicitly (adjust the path).
        access_log /var/log/nginx/access.log;
        access_log /var/log/nginx/suspicious_bots.log combined if=$suspicious_bot;

        # Known bad crawlers: close the connection without a response
        if ($bad_bot = 1) {
            return 444;
        }

        # Suspicious crawlers: answer with a placeholder page
        if ($suspicious_bot) {
            return 200 "<!DOCTYPE html><html><head><title>网站维护中</title></head><body></body></html>";
        }

        # Throttling for suspicious crawlers. `return` above runs in the
        # rewrite phase, before limit_req, so this only takes effect when the
        # placeholder `return` is removed. Choose one of the two strategies.
        limit_req zone=bot_limit burst=3 nodelay;

        location / {
            # regular application logic
            try_files $uri $uri/ /index.html;
        }

        # Static assets: long-lived caching. The server-level checks above run
        # before a location is selected, so they apply to these requests too.
        location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
            expires 1y;
            add_header Cache-Control "public, immutable";
        }
    }
}

5. 维护和更新

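创建IP列表文件 /etc/nginx/conf.d/bot-ips.conf（由 geo 块通过 include 引入）。注意：如果主配置中已有 include /etc/nginx/conf.d/*.conf;，该文件还会在 http 级别被直接加载并导致语法错误，此时应把它放到通配符匹配不到的路径，并同步修改 geo 块中的 include 路径：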

text

# Entries for a `geo` block: "<network> <value>;". This file is only valid
# when included from inside geo { }.
# Google ranges
66.249.64.0/19 1;
64.233.160.0/19 1;
# Baidu ranges
180.76.0.0/16 1;
# Originally 119.63.196.0/16: with host bits set nginx warns and applies the
# whole /16, so the network is written as nginx actually uses it.
# NOTE(review): a narrower prefix was probably intended - confirm.
119.63.0.0/16 1;
# Bing ranges
157.55.0.0/16 1;
207.46.0.0/16 1;

定期更新脚本：

bash

#!/bin/bash
# Refresh the raw Google IP range list used to build the crawler whitelist.
# The list is written to /tmp/google_ips.txt only after a successful download,
# so a failed run never leaves a truncated or empty file behind.
set -euo pipefail

# NOTE(review): goog.txt appears to list all Google-owned ranges, not only
# the crawler's; Google also publishes a dedicated Googlebot range list -
# confirm which source is wanted.
readonly SOURCE_URL="https://www.google.com/ipranges/goog.txt"
readonly TARGET_FILE="/tmp/google_ips.txt"

# Download into a private temp file next to the target; `wget -O` truncates
# its output file even when the transfer fails.
tmp_file=$(mktemp "${TARGET_FILE}.XXXXXX")
trap 'rm -f -- "$tmp_file"' EXIT

if ! wget -O "$tmp_file" "$SOURCE_URL"; then
  printf 'error: failed to download %s\n' "$SOURCE_URL" >&2
  exit 1
fi

if [[ ! -s "$tmp_file" ]]; then
  printf 'error: downloaded list from %s is empty\n' "$SOURCE_URL" >&2
  exit 1
fi

# mktemp creates the file with mode 0600; make it readable like a plain
# download, then move it into place (same directory, so the rename is atomic).
chmod 0644 -- "$tmp_file"
mv -f -- "$tmp_file" "$TARGET_FILE"

# Process the downloaded ranges and update the configuration file

注意事项

User-Agent可以被伪造，所以IP验证很重要；Google、Bing等官方推荐的验证方式是对来源IP做反向DNS查询，再正向解析确认（静态IP段只是近似手段）
定期更新IP段，官方爬虫IP会变化
监控日志，及时发现新的恶意爬虫
避免误杀，测试时先从宽松开始
考虑性能影响，复杂的匹配规则会增加CPU负担

这种配置可以有效拦截大多数无用爬虫，同时确保搜索引擎正常收录。

该文章在 2025/9/25 10:14:07 编辑过