# robots.txt v3.9 for Tripter
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go too fast. If you're
# irresponsible, your access to the site may be blocked.
#
# File layout:
#   * One record per crawler (or per set of crawler names), separated by
#     blank lines. Never put a blank line inside a record: older parsers
#     treat a blank line as the end of the record.
#   * Every directive sits on its own line; comments are kept on separate
#     lines because some parsers do not strip trailing "# ..." text.
#   * An empty "Disallow:" means "nothing is disallowed" (full access);
#     "Disallow: /" bans the crawler from the whole site.
#   * "Crawl-delay" is a non-standard extension, given in seconds between
#     requests. Not every crawler honours it.


# === SLOW DOWN ===

User-agent: IRLbot
Disallow:
Crawl-delay: 720
# http://irl.cs.tamu.edu/crawler/

User-agent: Slurp
Disallow:
Crawl-delay: 30
# 2007-06-15: 90 to 30
# http://help.yahoo.com/help/us/ysearch/slurp

User-agent: msnbot
Disallow:
Crawl-delay: 10
# 2007-06-15: 30 to 10
# http://search.msn.com/msnbot.htm
# http://search.msn.com.my/docs/siteowner.aspx

User-agent: msnbot-media
Disallow:
Crawl-delay: 90
# Seems unlikely that msn's MEDIA bot is controlled separately using this
# user-agent name, but am giving it a try.

User-agent: searchpreview
Disallow: /
# MSN Search Preview for page thumbnails. Is this the "msnbot-media" agent?
# http://search.msn.com/docs/siteowner.aspx?t=SEARCH_WEBMASTER_PROC_PreventPreview.htm

User-agent: Teoma
Disallow:
Crawl-delay: 30
# 2007-06-15: 60 to 30
# http://about.ask.com/en/docs/about/webmasters.shtml

User-agent: Yandex
Disallow:
Crawl-delay: 30
# 2007-06-15: 60 to 30
# If they use that user-agent name on one of their own servers, maybe it works.
# (see http://yaca.yandex.ru/robots.txt)

# Yodao obeys Disallow, but probably not Crawl-delay
User-agent: YodaoBot
Disallow: /

# Holmes (for Onet.pl) does not indicate its User-Agent name. Little info is
# available. For example,
#   http://www.internetofficer.com/web-robot/onetszukaj/
# has no details. Trying several user-agent names; they all share one record
# because the rules are identical.
User-agent: Onet
User-agent: onet
User-agent: onet.pl
User-agent: OnetSzukaj
User-agent: Holmes
User-agent: holmes
Disallow:
Crawl-delay: 180

User-agent: baiduspider
Disallow:
Crawl-delay: 0
# 2007-06-15: 60 > 10
# 2007-07-21: 10 > 0
# http://www.baidu.com/search/spider.htm
# http://www.baidu.com/search/robots.html

User-agent: Charlotte
Disallow:
Crawl-delay: 600
# "Stealth-mode startup" that crawls a bit fast. Replied to e-mail.
# Respects robots.txt and Crawl-delay.
# http://www.betaspider.com/

User-agent: MJ12bot
Disallow: /
# Crawl-delay: (intentionally unset while the bot is banned)
# MJ12 was downloading 658 pages/hour in January 2008.
# Info page says maximum crawl-delay is 20 seconds, which is still too fast
# (300 pages/hour/site) so I'm forced to ban. If it abides, consider setting
# a crawl-delay of 20-600 and see what happens.
# http://www.majestic12.co.uk/projects/dsearch/mj12bot.php#WhatIsIt


# === GOOGLEBOT ===
# Note: Googlebot does not recognize or obey "Crawl-delay". Instead, use
# WebMaster Control Panel:
#   https://www.google.com/webmasters/sitemaps/siteoverview
#   http://www.google.com/support/webmasters/bin/topic.py?topic=8843

User-agent: Googlebot
Disallow:
# !!! DISALLOWING: (none)
# The hotel sections below are listed for reference only; remove the leading
# "# " to block one. The trailing tag is the language of that path.
# Disallow: /hot%C3%A9is/                                                      # pt
# Disallow: /hoteles/                                                          # es
# Disallow: /h%C3%B4tels/                                                      # fr
# Disallow: /%E3%83%9B%E3%83%86%E3%83%AB/                                      # ja
# Disallow: /noclegi/                                                          # pl
# Disallow: /alberghi/                                                         # it
# Disallow: /hotels/                                                           # en nl de ca
# Disallow: /%CE%BE%CE%B5%CE%BD%CE%BF%CE%B4%CE%BF%CF%87%CE%B5%CE%AF%CE%B1/     # el
# Disallow: /%D0%B3%D0%BE%D1%81%D1%82%D0%B8%D0%BD%D0%B8%D1%86%D1%8B/           # ru
# Disallow: /%E9%85%92%E5%BA%97/                                               # zh-tw zh-cn


# ===== AGENTS NOT SPECIFIED ABOVE =====

User-agent: *
Disallow:
# 180 = 3600 sec/hr / 180 sec/hit = 20 pg/hr(/lang) * 17 langs = 340 pgs/hr MAX
Crawl-delay: 180
# maximum rate is x pages every y seconds
Request-rate: 1/180
# only visit between xx:xx and yy:yy UT (GMT)
Visit-time: 2300-0800


# === BANNED ===

User-agent: psbot
Disallow: /
# http://www.picsearch.com/menu.cgi?item=Psbot
# Did not heed crawl-delay within 14 days, banning.


# === NEED GUIDANCE ===

User-agent: linksmanager_bot
Crawl-delay: 240
# http://linksmanager.com/linkchecker.html
# NOTE(review): /hotels/ is commented out here but active in the
# "linksmanager" record below -- confirm which one is intended.
# Disallow: /hotels/
Disallow: /hot%C3%A9is/
Disallow: /hoteles/
Disallow: /h%C3%B4tels/
Disallow: /alberghi/
Disallow: /noclegi/
Disallow: /%CE%BE%CE%B5%CE%BD%CE%BF%CE%B4%CE%BF%CF%87%CE%B5%CE%AF%CE%B1/
Disallow: /%D0%B3%D0%BE%D1%81%D1%82%D0%B8%D0%BD%D0%B8%D1%86%D1%8B/
Disallow: /%E9%85%92%E5%BA%97/
Disallow: /%E3%83%9B%E3%83%86%E3%83%AB/

User-agent: linksmanager
Crawl-delay: 240
Disallow: /hotels/
Disallow: /hot%C3%A9is/
Disallow: /hoteles/
Disallow: /h%C3%B4tels/
Disallow: /alberghi/
Disallow: /noclegi/
Disallow: /%CE%BE%CE%B5%CE%BD%CE%BF%CE%B4%CE%BF%CF%87%CE%B5%CE%AF%CE%B1/
Disallow: /%D0%B3%D0%BE%D1%81%D1%82%D0%B8%D0%BD%D0%B8%D1%86%D1%8B/
Disallow: /%E9%85%92%E5%BA%97/
Disallow: /%E3%83%9B%E3%83%86%E3%83%AB/


# ===== THANKS TO WIKIPEDIA.ORG/robots.txt (AUG06) =====

# === DISREGARD ROBOTS.TXT ===

User-agent: k2spider
Disallow: /
# Doesn't follow robots.txt anyway.


# === DEAD-LINK CHECKERS & SITE MONITORING ===

User-agent: Xenu
Disallow:
Crawl-delay: 240

User-agent: ZyBORG
Disallow:
Crawl-delay: 240
# Some relation with http://www.wisenutbot.com/

User-agent: sitecheck.internetseer.com
Disallow:
Crawl-delay: 240
# Website monitoring service


# === PROGRAMMING TOOLS (can be used as strippers?) ===

User-agent: Microsoft.URL.Control
Disallow: /


# === GENERIC CRAWLERS (may be strippers) ===

User-agent: larbin
Disallow: /
# Open-source crawler
# larbin.sourceforge.net/index-eng.html

User-agent: libwww
Disallow: /
# Web API created by W3C for robots, etc.
# http://www.w3.org/Library/


# === SITE SCRAPERS & OFFLINE VIEWERS ===

User-agent: HTTrack
Disallow: /
# Site scraper
# http://www.httrack.com/

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: Download Ninja
Disallow: /

User-agent: wget
Disallow: /
# Wget is a free (GNU) scraper
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.

User-agent: WebReaper
Disallow: /
# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/


# ===== Useful Sites =====
# List of Agents & if good or malign:
# http://www.infosyssec.com/infosyssec/security/useragentstrings.shtml