# PaCkAgE DaTaStReAm SUNWurlgrabber 1 675 # end of header 0707010002bbf5000081a40000000000000000000000014ae194b3000001d4000000b600010006ffffffffffffffff0000001700000000SUNWurlgrabber/pkginfoPKG=SUNWurlgrabber NAME=Download Tool ARCH=i386 VERSION=11.11,REV=2009.10.23.10.12 SUNW_PRODNAME=SunOS SUNW_PRODVERS=5.11/SunOS Development SUNW_PKGTYPE=usr SUNW_PKG_ALLZONES=true SUNW_PKG_HOLLOW=true SUNW_PKG_THISZONE=false MAXINST=1000 CATEGORY=system DESC=Command-line tool for downloading files over HTTP VENDOR=Sun Microsystems, Inc. HOTLINE=Please contact your local service provider EMAIL= CLASSES=none BASEDIR=/ SUNW_PKGVERS=1.0 PSTAMP=priscilla20091023113411 0707010002bbf4000081a40000000000000000000000014ae194b300000752000000b600010006ffffffffffffffff0000001600000000SUNWurlgrabber/pkgmap: 1 675 1 i copyright 24389 28408 1256282684 1 i depend 1715 15424 1256282684 1 i pkginfo 468 37426 1256297651 1 d none usr 0755 root sys 1 d none usr/bin 0755 root bin 1 f none usr/bin/urlgrabber 0555 root bin 4868 25234 1256297645 1 d none usr/lib 0755 root bin 1 d none usr/lib/python2.4 0755 root bin 1 d none usr/lib/python2.4/vendor-packages 0755 root bin 1 d none usr/lib/python2.4/vendor-packages/urlgrabber 0755 root bin 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/__init__.py 0644 root bin 2259 62723 1158886735 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/__init__.pyc 0644 root bin 1746 9295 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/byterange.py 0644 root bin 17157 49128 1153426558 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/byterange.pyc 0644 root bin 15737 38795 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/grabber.py 0644 root bin 56807 7888 1158886685 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/grabber.pyc 0644 root bin 51025 64169 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/keepalive.py 0644 root bin 21089 26946 1158886685 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/keepalive.pyc 0644 root bin 19228 37771 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/mirror.py 0644 root bin 18069 45530 1140632806 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/mirror.pyc 0644 root bin 16276 5380 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/progress.py 0644 root bin 18235 31218 1124488747 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/progress.pyc 0644 root bin 17055 61012 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/sslfactory.py 0644 root bin 3099 50637 1158886685 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/sslfactory.pyc 0644 root bin 2713 58652 1256297645 07070100000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000b00000000TRAILER!!!0707010002bbf5000081a40000000000000000000000014ae194b3000001d4000000b600010006ffffffffffffffff0000000800000000pkginfoPKG=SUNWurlgrabber NAME=Download Tool ARCH=i386 VERSION=11.11,REV=2009.10.23.10.12 SUNW_PRODNAME=SunOS SUNW_PRODVERS=5.11/SunOS Development SUNW_PKGTYPE=usr SUNW_PKG_ALLZONES=true SUNW_PKG_HOLLOW=true SUNW_PKG_THISZONE=false MAXINST=1000 CATEGORY=system DESC=Command-line tool for downloading files over HTTP VENDOR=Sun Microsystems, Inc. HOTLINE=Please contact your local service provider EMAIL= CLASSES=none BASEDIR=/ SUNW_PKGVERS=1.0 PSTAMP=priscilla20091023113411 0707010002bbf4000081a40000000000000000000000014ae194b300000752000000b600010006ffffffffffffffff0000000700000000pkgmap: 1 675 1 i copyright 24389 28408 1256282684 1 i depend 1715 15424 1256282684 1 i pkginfo 468 37426 1256297651 1 d none usr 0755 root sys 1 d none usr/bin 0755 root bin 1 f none usr/bin/urlgrabber 0555 root bin 4868 25234 1256297645 1 d none usr/lib 0755 root bin 1 d none usr/lib/python2.4 0755 root bin 1 d none usr/lib/python2.4/vendor-packages 0755 root bin 1 d none usr/lib/python2.4/vendor-packages/urlgrabber 0755 root bin 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/__init__.py 0644 root bin 2259 62723 1158886735 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/__init__.pyc 0644 root bin 1746 9295 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/byterange.py 0644 root bin 17157 49128 1153426558 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/byterange.pyc 0644 root bin 15737 38795 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/grabber.py 0644 root bin 56807 7888 1158886685 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/grabber.pyc 0644 root bin 51025 64169 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/keepalive.py 0644 root bin 21089 26946 1158886685 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/keepalive.pyc 0644 root bin 19228 37771 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/mirror.py 0644 root bin 18069 45530 1140632806 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/mirror.pyc 0644 root bin 16276 5380 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/progress.py 0644 root bin 18235 31218 1124488747 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/progress.pyc 0644 root bin 17055 61012 1256297645 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/sslfactory.py 0644 root bin 3099 50637 1158886685 1 f none usr/lib/python2.4/vendor-packages/urlgrabber/sslfactory.pyc 0644 root bin 2713 58652 1256297645 0707010002bbf6000041ed0000000000000000000000024ae194b300000000000000b600010006ffffffffffffffff0000000800000000install0707010002bbf8000081a40000000000000000000000014ae15a3c000006b3000000b600010006ffffffffffffffff0000000f00000000install/depend# # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # This package information file defines software dependencies associated # with the pkg. You can define three types of pkg dependencies with this file: # P indicates a prerequisite for installation # I indicates an incompatible package # R indicates a reverse dependency # see pkginfo(4), PKG parameter # see pkginfo(4), NAME parameter # see pkginfo(4), VERSION parameter # see pkginfo(4), ARCH parameter # # () # () # ... # # ... P SUNWcar Core Architecture, (Root) P SUNWkvm Core Architecture, (Kvm) P SUNWcsr Core Solaris, (Root) P SUNWcsu Core Solaris, (Usr) P SUNWcsd Core Solaris Devices P SUNWcsl Core Solaris Libraries P SUNWPython The Python interpreter, libraries, and utilties 0707010002bbf7000081a40000000000000000000000014ae15a3c00005f45000000b600010006ffffffffffffffff0000001200000000install/copyright GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS 0707010002bbf9000041ed0000000000000000000000034ae194b300000000000000b600010006ffffffffffffffff0000000600000000reloc0707010002bbfa000041ed0000000000000000000000044ae194b300000000000000b600010006ffffffffffffffff0000000a00000000reloc/usr0707010002bbfd000041ed0000000000000000000000034ae194b300000000000000b600010006ffffffffffffffff0000000e00000000reloc/usr/lib0707010002bbfe000041ed0000000000000000000000034ae194b300000000000000b600010006ffffffffffffffff0000001800000000reloc/usr/lib/python2.40707010002bbff000041ed0000000000000000000000034ae194b300000000000000b600010006ffffffffffffffff0000002800000000reloc/usr/lib/python2.4/vendor-packages0707010002bc00000041ed0000000000000000000000024ae194b300000000000000b600010006ffffffffffffffff0000003300000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber0707010002bc02000081a40000000000000002000000014ae194ad000006d2000000b600010006ffffffffffffffff0000004000000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/__init__.pycm O5Ec@s;dZdZdZdZdZdklZlZlZdS(sA high-level cross-protocol url-grabber. Using urlgrabber, data can be fetched in three basic ways: urlgrab(url) copy the file to the local filesystem urlopen(url) open the remote file and return a file object (like urllib2.urlopen) urlread(url) return the contents of the file as a string When using these functions (or methods), urlgrabber supports the following features: * identical behavior for http://, ftp://, and file:// urls * http keepalive - faster downloads of many files by using only a single connection * byte ranges - fetch only a portion of the file * reget - for a urlgrab, resume a partial download * progress meters - the ability to report download progress automatically, even when using urlopen! * throttling - restrict bandwidth usage * retries - automatically retry a download if it fails. The number of retries and failure types are configurable. * authenticated server access for http and ftp * proxy support - support for authenticated http and ftp proxies * mirror groups - treat a list of mirrors as a single source, automatically switching mirrors if there is a failure. s3.1.0s 2006/09/21sPMichael D. Stenner , Ryan Tomayko s*http://linux.duke.edu/projects/urlgrabber/(surlgrabsurlopensurlreadN( t__doc__t __version__t__date__t __author__t__url__tgrabberturlgrabturlopenturlread(RRRRRRR((t_/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/__init__.pyt?-s 0707010002bc07000081a40000000000000002000000014513351d00005261000000b600010006ffffffffffffffff0000004000000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/keepalive.py# This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko """An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive. >>> import urllib2 >>> from keepalive import HTTPHandler >>> keepalive_handler = HTTPHandler() >>> opener = urllib2.build_opener(keepalive_handler) >>> urllib2.install_opener(opener) >>> >>> fo = urllib2.urlopen('http://www.python.org') If a connection to a given host is requested, and all of the existing connections are still in use, another connection will be opened. If the handler tries to use an existing connection but it fails in some way, it will be closed and removed from the pool. To remove the handler, simply re-run build_opener with no arguments, and install that opener. You can explicitly close connections by using the close_connection() method of the returned file-like object (described below) or you can use the handler methods: close_connection(host) close_all() open_connections() NOTE: using the close_connection and close_all methods of the handler should be done with care when using multiple threads. * there is nothing that prevents another thread from creating new connections immediately after connections are closed * no checks are done to prevent in-use connections from being closed >>> keepalive_handler.close_all() EXTRA ATTRIBUTES AND METHODS Upon a status of 200, the object returned has a few additional attributes and methods, which should not be used if you want to remain consistent with the normal urllib2-returned objects: close_connection() - close the connection to the host readlines() - you know, readlines() status - the return status (ie 404) reason - english translation of status (ie 'File not found') If you want the best of both worlds, use this inside an AttributeError-catching try: >>> try: status = fo.status >>> except AttributeError: status = None Unfortunately, these are ONLY there if status == 200, so it's not easy to distinguish between non-200 responses. The reason is that urllib2 tries to do clever things with error codes 301, 302, 401, and 407, and it wraps the object upon return. For python versions earlier than 2.4, you can avoid this fancy error handling by setting the module-level global HANDLE_ERRORS to zero. You see, prior to 2.4, it's the HTTP Handler's job to determine what to handle specially, and what to just pass up. HANDLE_ERRORS == 0 means "pass everything up". In python 2.4, however, this job no longer belongs to the HTTP Handler and is now done by a NEW handler, HTTPErrorProcessor. Here's the bottom line: python version < 2.4 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as errors HANDLE_ERRORS == 0 pass everything up, error processing is left to the calling code python version >= 2.4 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors HANDLE_ERRORS == 0 (default) pass everything up, let the other handlers (specifically, HTTPErrorProcessor) decide what to do In practice, setting the variable either way makes little difference in python 2.4, so for the most consistent behavior across versions, you probably just want to use the defaults, which will give you exceptions on errors. """ # $Id: keepalive.py,v 1.16 2006/09/22 00:58:05 mstenner Exp $ import urllib2 import httplib import socket import thread DEBUG = None import sslfactory import sys if sys.version_info < (2, 4): HANDLE_ERRORS = 1 else: HANDLE_ERRORS = 0 class ConnectionManager: """ The connection manager must be able to: * keep track of all existing """ def __init__(self): self._lock = thread.allocate_lock() self._hostmap = {} # map hosts to a list of connections self._connmap = {} # map connections to host self._readymap = {} # map connection to ready state def add(self, host, connection, ready): self._lock.acquire() try: if not self._hostmap.has_key(host): self._hostmap[host] = [] self._hostmap[host].append(connection) self._connmap[connection] = host self._readymap[connection] = ready finally: self._lock.release() def remove(self, connection): self._lock.acquire() try: try: host = self._connmap[connection] except KeyError: pass else: del self._connmap[connection] del self._readymap[connection] self._hostmap[host].remove(connection) if not self._hostmap[host]: del self._hostmap[host] finally: self._lock.release() def set_ready(self, connection, ready): try: self._readymap[connection] = ready except KeyError: pass def get_ready_conn(self, host): conn = None self._lock.acquire() try: if self._hostmap.has_key(host): for c in self._hostmap[host]: if self._readymap[c]: self._readymap[c] = 0 conn = c break finally: self._lock.release() return conn def get_all(self, host=None): if host: return list(self._hostmap.get(host, [])) else: return dict(self._hostmap) class KeepAliveHandler: def __init__(self): self._cm = ConnectionManager() #### Connection Management def open_connections(self): """return a list of connected hosts and the number of connections to each. [('foo.com:80', 2), ('bar.org', 1)]""" return [(host, len(li)) for (host, li) in self._cm.get_all().items()] def close_connection(self, host): """close connection(s) to host is the host:port spec, as in 'www.cnn.com:8080' as passed in. no error occurs if there is no connection to that host.""" for h in self._cm.get_all(host): self._cm.remove(h) h.close() def close_all(self): """close all open connections""" for host, conns in self._cm.get_all().items(): for h in conns: self._cm.remove(h) h.close() def _request_closed(self, request, host, connection): """tells us that this request is now closed and the the connection is ready for another request""" self._cm.set_ready(connection, 1) def _remove_connection(self, host, connection, close=0): if close: connection.close() self._cm.remove(connection) #### Transaction Execution def do_open(self, req): host = req.get_host() if not host: raise urllib2.URLError('no host given') try: h = self._cm.get_ready_conn(host) while h: r = self._reuse_connection(h, req, host) # if this response is non-None, then it worked and we're # done. Break out, skipping the else block. if r: break # connection is bad - possibly closed by server # discard it and ask for the next free connection h.close() self._cm.remove(h) h = self._cm.get_ready_conn(host) else: # no (working) free connections were found. Create a new one. h = self._get_connection(host) if DEBUG: DEBUG.info("creating new connection to %s (%d)", host, id(h)) self._cm.add(host, h, 0) self._start_transaction(h, req) r = h.getresponse() except (socket.error, httplib.HTTPException), err: raise urllib2.URLError(err) # if not a persistent connection, don't try to reuse it if r.will_close: self._cm.remove(h) if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason) r._handler = self r._host = host r._url = req.get_full_url() r._connection = h r.code = r.status r.headers = r.msg r.msg = r.reason if r.status == 200 or not HANDLE_ERRORS: return r else: return self.parent.error('http', req, r, r.status, r.msg, r.headers) def _reuse_connection(self, h, req, host): """start the transaction with a re-used connection return a response object (r) upon success or None on failure. This DOES not close or remove bad connections in cases where it returns. However, if an unexpected exception occurs, it will close and remove the connection before re-raising. """ try: self._start_transaction(h, req) r = h.getresponse() # note: just because we got something back doesn't mean it # worked. We'll check the version below, too. except (socket.error, httplib.HTTPException): r = None except: # adding this block just in case we've missed # something we will still raise the exception, but # lets try and close the connection and remove it # first. We previously got into a nasty loop # where an exception was uncaught, and so the # connection stayed open. On the next try, the # same exception was raised, etc. The tradeoff is # that it's now possible this call will raise # a DIFFERENT exception if DEBUG: DEBUG.error("unexpected exception - closing " + \ "connection to %s (%d)", host, id(h)) self._cm.remove(h) h.close() raise if r is None or r.version == 9: # httplib falls back to assuming HTTP 0.9 if it gets a # bad header back. This is most likely to happen if # the socket has been closed by the server since we # last used the connection. if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)", host, id(h)) r = None else: if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h)) return r def _start_transaction(self, h, req): try: if req.has_data(): data = req.get_data() h.putrequest('POST', req.get_selector()) if not req.headers.has_key('Content-type'): h.putheader('Content-type', 'application/x-www-form-urlencoded') if not req.headers.has_key('Content-length'): h.putheader('Content-length', '%d' % len(data)) else: h.putrequest('GET', req.get_selector()) except (socket.error, httplib.HTTPException), err: raise urllib2.URLError(err) for args in self.parent.addheaders: h.putheader(*args) for k, v in req.headers.items(): h.putheader(k, v) h.endheaders() if req.has_data(): h.send(data) def _get_connection(self, host): return NotImplementedError class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler): def __init__(self): KeepAliveHandler.__init__(self) def http_open(self, req): return self.do_open(req) def _get_connection(self, host): return HTTPConnection(host) class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler): def __init__(self, ssl_factory=None): KeepAliveHandler.__init__(self) if not ssl_factory: ssl_factory = sslfactory.get_factory() self._ssl_factory = ssl_factory def https_open(self, req): return self.do_open(req) def _get_connection(self, host): return self._ssl_factory.get_https_connection(host) class HTTPResponse(httplib.HTTPResponse): # we need to subclass HTTPResponse in order to # 1) add readline() and readlines() methods # 2) add close_connection() methods # 3) add info() and geturl() methods # in order to add readline(), read must be modified to deal with a # buffer. example: readline must read a buffer and then spit back # one line at a time. The only real alternative is to read one # BYTE at a time (ick). Once something has been read, it can't be # put back (ok, maybe it can, but that's even uglier than this), # so if you THEN do a normal read, you must first take stuff from # the buffer. # the read method wraps the original to accomodate buffering, # although read() never adds to the buffer. # Both readline and readlines have been stolen with almost no # modification from socket.py def __init__(self, sock, debuglevel=0, strict=0, method=None): if method: # the httplib in python 2.3 uses the method arg httplib.HTTPResponse.__init__(self, sock, debuglevel, method) else: # 2.2 doesn't httplib.HTTPResponse.__init__(self, sock, debuglevel) self.fileno = sock.fileno self.code = None self._rbuf = '' self._rbufsize = 8096 self._handler = None # inserted by the handler later self._host = None # (same) self._url = None # (same) self._connection = None # (same) _raw_read = httplib.HTTPResponse.read def close(self): if self.fp: self.fp.close() self.fp = None if self._handler: self._handler._request_closed(self, self._host, self._connection) def close_connection(self): self._handler._remove_connection(self._host, self._connection, close=1) self.close() def info(self): return self.headers def geturl(self): return self._url def read(self, amt=None): # the _rbuf test is only in this first if for speed. It's not # logically necessary if self._rbuf and not amt is None: L = len(self._rbuf) if amt > L: amt -= L else: s = self._rbuf[:amt] self._rbuf = self._rbuf[amt:] return s s = self._rbuf + self._raw_read(amt) self._rbuf = '' return s def readline(self, limit=-1): data = "" i = self._rbuf.find('\n') while i < 0 and not (0 < limit <= len(self._rbuf)): new = self._raw_read(self._rbufsize) if not new: break i = new.find('\n') if i >= 0: i = i + len(self._rbuf) self._rbuf = self._rbuf + new if i < 0: i = len(self._rbuf) else: i = i+1 if 0 <= limit < len(self._rbuf): i = limit data, self._rbuf = self._rbuf[:i], self._rbuf[i:] return data def readlines(self, sizehint = 0): total = 0 list = [] while 1: line = self.readline() if not line: break list.append(line) total += len(line) if sizehint and total >= sizehint: break return list class HTTPConnection(httplib.HTTPConnection): # use the modified response class response_class = HTTPResponse class HTTPSConnection(httplib.HTTPSConnection): response_class = HTTPResponse ######################################################################### ##### TEST FUNCTIONS ######################################################################### def error_handler(url): global HANDLE_ERRORS orig = HANDLE_ERRORS keepalive_handler = HTTPHandler() opener = urllib2.build_opener(keepalive_handler) urllib2.install_opener(opener) pos = {0: 'off', 1: 'on'} for i in (0, 1): print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i) HANDLE_ERRORS = i try: fo = urllib2.urlopen(url) foo = fo.read() fo.close() try: status, reason = fo.status, fo.reason except AttributeError: status, reason = None, None except IOError, e: print " EXCEPTION: %s" % e raise else: print " status = %s, reason = %s" % (status, reason) HANDLE_ERRORS = orig hosts = keepalive_handler.open_connections() print "open connections:", hosts keepalive_handler.close_all() def continuity(url): import md5 format = '%25s: %s' # first fetch the file with the normal http handler opener = urllib2.build_opener() urllib2.install_opener(opener) fo = urllib2.urlopen(url) foo = fo.read() fo.close() m = md5.new(foo) print format % ('normal urllib', m.hexdigest()) # now install the keepalive handler and try again opener = urllib2.build_opener(HTTPHandler()) urllib2.install_opener(opener) fo = urllib2.urlopen(url) foo = fo.read() fo.close() m = md5.new(foo) print format % ('keepalive read', m.hexdigest()) fo = urllib2.urlopen(url) foo = '' while 1: f = fo.readline() if f: foo = foo + f else: break fo.close() m = md5.new(foo) print format % ('keepalive readline', m.hexdigest()) def comp(N, url): print ' making %i connections to:\n %s' % (N, url) sys.stdout.write(' first using the normal urllib handlers') # first use normal opener opener = urllib2.build_opener() urllib2.install_opener(opener) t1 = fetch(N, url) print ' TIME: %.3f s' % t1 sys.stdout.write(' now using the keepalive handler ') # now install the keepalive handler and try again opener = urllib2.build_opener(HTTPHandler()) urllib2.install_opener(opener) t2 = fetch(N, url) print ' TIME: %.3f s' % t2 print ' improvement factor: %.2f' % (t1/t2, ) def fetch(N, url, delay=0): import time lens = [] starttime = time.time() for i in range(N): if delay and i > 0: time.sleep(delay) fo = urllib2.urlopen(url) foo = fo.read() fo.close() lens.append(len(foo)) diff = time.time() - starttime j = 0 for i in lens[1:]: j = j + 1 if not i == lens[0]: print "WARNING: inconsistent length on read %i: %i" % (j, i) return diff def test_timeout(url): global DEBUG dbbackup = DEBUG class FakeLogger: def debug(self, msg, *args): print msg % args info = warning = error = debug DEBUG = FakeLogger() print " fetching the file to establish a connection" fo = urllib2.urlopen(url) data1 = fo.read() fo.close() i = 20 print " waiting %i seconds for the server to close the connection" % i while i > 0: sys.stdout.write('\r %2i' % i) sys.stdout.flush() time.sleep(1) i -= 1 sys.stderr.write('\r') print " fetching the file a second time" fo = urllib2.urlopen(url) data2 = fo.read() fo.close() if data1 == data2: print ' data are identical' else: print ' ERROR: DATA DIFFER' DEBUG = dbbackup def test(url, N=10): print "checking error hander (do this on a non-200)" try: error_handler(url) except IOError, e: print "exiting - exception will prevent further tests" sys.exit() print print "performing continuity test (making sure stuff isn't corrupted)" continuity(url) print print "performing speed comparison" comp(N, url) print print "performing dropped-connection check" test_timeout(url) if __name__ == '__main__': import time import sys try: N = int(sys.argv[1]) url = sys.argv[2] except: print "%s " % sys.argv[0] else: test(url, N) 0707010002bc0d000081a40000000000000002000000014513351d00000c1b000000b600010006ffffffffffffffff0000004100000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/sslfactory.py# This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber import httplib import urllib2 try: from M2Crypto import SSL from M2Crypto import httpslib from M2Crypto import m2urllib2 have_m2crypto = True except ImportError: have_m2crypto = False DEBUG = None if have_m2crypto: class M2SSLFactory: def __init__(self, ssl_ca_cert, ssl_context): self.ssl_context = self._get_ssl_context(ssl_ca_cert, ssl_context) def _get_ssl_context(self, ssl_ca_cert, ssl_context): """ Create an ssl context using the CA cert file or ssl context. The CA cert is used first if it was passed as an option. If not, then the supplied ssl context is used. If no ssl context was supplied, None is returned. """ if ssl_ca_cert: context = SSL.Context() context.load_verify_locations(ssl_ca_cert) context.set_verify(SSL.verify_peer, -1) return context else: return ssl_context def create_https_connection(self, host, response_class = None): connection = httplib.HTTPSConnection(host, self.ssl_context) if response_class: connection.response_class = response_class return connection def create_opener(self, *handlers): return m2urllib2.build_opener(self.ssl_context, *handlers) class SSLFactory: def create_https_connection(self, host, response_class = None): connection = httplib.HTTPSConnection(host) if response_class: connection.response_class = response_class return connection def create_opener(self, *handlers): return urllib2.build_opener(*handlers) def get_factory(ssl_ca_cert = None, ssl_context = None): """ Return an SSLFactory, based on if M2Crypto is available. """ if have_m2crypto: return M2SSLFactory(ssl_ca_cert, ssl_context) else: # Log here if someone provides the args but we don't use them. if ssl_ca_cert or ssl_context: if DEBUG: DEBUG.warning("SSL arguments supplied, but M2Crypto is not available. " "Using Python SSL.") return SSLFactory() 0707010002bc04000081a40000000000000002000000014ae194ad00003d79000000b600010006ffffffffffffffff0000004100000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/byterange.pycm ~Dc@sdkZdkZdkZdkZdkZdZydklZWn"e j oZ dklZnXde fdYZ dei fdYZdefdYZdfd YZd eifd YZd klZlZlZlZlZlZlZdkZdkZdkZdkZdkZdkZd eifdYZ dei!fdYZ!da"dZ#dZ$dZ%dS(N(sStringIOt RangeErrorcBstZdZRS(s6Error raised when an unsatisfiable range is requested.(t__name__t __module__t__doc__(((t`/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/byterange.pyR#s tHTTPRangeHandlercBs tZdZdZdZRS(sHandler that enables HTTP Range headers. This was extremely simple. The Range header is a HTTP feature to begin with so all this class does is tell urllib2 that the "206 Partial Content" reponse from the HTTP server is what we expected. Example: import urllib2 import byterange range_handler = range.HTTPRangeHandler() opener = urllib2.build_opener(range_handler) # install it urllib2.install_opener(opener) # create Request and set Range header req = urllib2.Request('http://www.python.org/') req.header['Range'] = 'bytes=30-50' f = urllib2.urlopen(req) cCs1ti|||i}||_||_|S(N( turllibt addinfourltfpthdrstreqt get_full_urltrtcodetmsg(tselfR RR RR R ((Rthttp_error_206?s  cCstddS(NsRequested Range Not Satisfiable(R(RR RR RR ((Rthttp_error_416Fs(RRRRR(((RR's  tHTTPSRangeHandlercBs tZdZdZdZRS(s! Range Header support for HTTPS. cCs|i|||||S(N(RRR RR RR (RR RR RR ((Rthttps_error_206MscCs|i|||||dS(N(Rthttps_error_416R RR RR (RR RR RR ((RRPs(RRRRR(((RRJs  tRangeableFileObjectcBshtZdZdZdZdZddZddZddZd Z d Z d Z RS( s"File object wrapper to enable raw range handling. This was implemented primarilary for handling range specifications for file:// urls. This object effectively makes a file object look like it consists only of a range of bytes in the stream. Examples: # expose 10 bytes, starting at byte position 20, from # /etc/aliases. >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30)) # seek seeks within the range (to position 23 in this case) >>> fo.seek(3) # tell tells where your at _within the range_ (position 3 in # this case) >>> fo.tell() # read EOFs if an attempt is made to read past the last # byte in the range. the following will return only 7 bytes. >>> fo.read(30) cCs>||_t|\|_|_d|_|i|idS(sCreate a RangeableFileObject. fo -- a file like object. only the read() method need be supported but supporting an optimized seek() is preferable. rangetup -- a (firstbyte,lastbyte) tuple specifying the range to work over. The file object provided is assumed to be at byte offset 0. iN(tfoRtrange_tuple_normalizetrangetupt firstbytetlastbytetrealpost_do_seek(RRR((Rt__init__hs   cCs4t|i|ot|i|Snt|dS(sThis effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. This includes methods.N(thasattrRRtnametgetattrtAttributeError(RR((Rt __getattr__vscCs|i|iS(s5Return the position within the range. This is different from fo.seek in that position 0 is the first byte position of the range tuple. For example, if this object was created with a range tuple of (500,899), tell() will return 0 when at byte position 500 of the file. N(RRR(R((Rttell~sicCs|djpt|djo|i|}n<|djo|i|}n|djotdn|io||ijo |i}n|i ||idS(seSeek within the byte range. Positioning is identical to that described under tell(). iiis$seek from end of file not supported.N(iii( twhencetAssertionErrorRRtoffsett realoffsetRtIOErrorRR(RR&R$R'((Rtseeks    icCs:|i|}|ii|}|it|7_|S(s`Read within the range. This method will limit the size read based on the range. N(Rt_calc_read_sizetsizeRtreadtrsltRtlen(RR+R-((RR,s cCs:|i|}|ii|}|it|7_|S(sfRead lines within the range. This method will limit the size read based on the range. N(RR*R+RtreadlineR-RR.(RR+R-((RR/s cCs^|ioP|djo/|i||ijo|i|i}qVqZ|i|i}n|S(sSHandles calculating the amount of data to read based on the range. iN(RRR+R(RR+((RR*s  cCsb|djptt|idp|i|n|ii|i||i|7_dS(sSeek based on whether wrapped object supports seek(). offset is relative to the current position (self.realpos). iR)N(R&R%RRRt_poor_mans_seekR)R(RR&((RRs cCsd}d}xp||job|||jo||}n|ii|}t||jotdn||7}qWdS(sSeek by calling the wrapped file objects read() method. This is used for file like objects that do not have native seek support. The wrapped objects read() method is called to manually seek to the desired position. offset -- read this number of bytes from the wrapped file object. raise RangeError if we encounter EOF before reaching the specified offset. iisRequested Range Not SatisfiableN( tpostbufsizeR&RRR,tbufR.R(RR&R1R2R3((RR0s  ( RRRRR"R#R)R,R/R*RR0(((RRSs     tFileRangeHandlercBstZdZdZRS(s~FileHandler subclass that adds Range support. This class handles Range headers exactly like an HTTP server would. cCsdk} dk}|i} |i}ti|} t i | }|t i }ti|t i}| i|d}| oOti| \} }|pti| |ijotidqnt| d}|ii dd}t#|}|fjpt$|o}|\} } | djo |} n| djp| |jp | |jot'dn| | }t(|| | f}n|i)t*d|pd||f}ti+||d |S( Nisfile not on local hosttrbtRangetsRequested Range Not Satisfiables6Content-Type: %s Content-Length: %d Last-modified: %s s text/plainsfile:(,t mimetypest mimetoolsR tget_hostthostt get_selectortfileRt url2pathnamet localfiletoststattstatstST_SIZER+trfc822t formatdatetST_MTIMEtmodifiedt guess_typetmtypet splitporttporttsockett gethostbynameRt get_namesturllib2tURLErrortopenRtheaderstgettNonetbrangetrange_header_to_tupleR%tfbtlbRRtMessagetStringIOR(RR R9RURIR=RKR+RBRWR?R8R;RXRRGRR((Rtopen_local_files6     #   ' )(RRRR[(((RR4s (s splitports splitusers splitpasswds splitattrsunquotes addclosehooks addinfourltFTPRangeHandlercBstZdZdZRS(NcCs|i}|p tdnt|\}}|djo ti}nt |\}}|ot |\}} nd} t |}t |pd}t | pd} yti|}Wn'tij o}ti|nXt|i\}}|id} tt | } | d | d} }| o| d o| d} ny7|i|| ||| } |odpd } xP|D]H}t|\}}|i"d jo|djo|i#} qqWd}t%|i&i'dd}|fjpt)|o'|\}} |djo |}qDn| i,|| |\}}|o|\}} | djo[|djp |djot/dn|} | |}|djot/dqq| |}t0|d|f}nd}t1i2|i3d}|o|d|7}n|dj o|djo|d|7}nt5|}t7i8|}t9|||i3SWn4ti:j o%}td|ft;i<dnXdS(Ns ftp errors no host givenR7t/iiitItDttypetatAtitdR6s@Requested Range Not Satisfiable due to unobtainable file length.sRequested Range Not SatisfiablesContent-Type: %s sContent-Length: %d i(s ftp errors no host given(RaRbRcR^RdR_(=R R:R;R(RJRKRTtftplibtFTP_PORTt splitusertusert splitpasswdtpasswdtunquoteRLRMterrorRRORPt splitattrR<tpathtattrstsplittdirstmapR=Rt connect_ftptfwR`tattrtvaluetlowertuppertrestRVRRRSt range_tupR%RWRXtretrfileRtretrlenRRR8RHR RIRZtsfR9RYRt all_errorstsystexc_info(RR RRzRyRIRoR=RKRXR`RqRtRjRR;RhRnR|RuRWRvRRR}((Rtftp_open sz              cCst|||||}|S(N(t ftpwrapperRhRjR;RKRqRt(RRhRjR;RKRqRt((RRsXs(RRRRs(((RR\ s LRcBstZddZRS(Nc Cs1|i|djod}d}nd|}d}y|ii|Wn0tij o!|i |ii|nXd} |o.| o&y|ii |Wn4tij o%}td|ftidnX|ii|y#d |}|ii||} Wqtij o}t|d d jo;|i||\}}t||d f}||fSqt|d d jo td|ftidqqXn| pA|iid|od|}nd}|ii|} nd|_t| did|i| dfS(NRdR_sTYPE AisTYPE is ftp errorisRETR it501R7t550sLIST tLISTR5(RdR_(Rt endtransferR`tcmdtisdirtftptvoidcmdReR~tinitRTtconnR=tnlstt error_permtreasonR(RRt ntransfercmdRytstrR{RR|Rtbusyt addclosehooktmakefile( RR=R`RyRRRRR|R((RR{asF   ! ) (RRRTR{(((RR\scCs|djodSntdjodk}|idanti|}|oNt|idd}|o'|do|d|ddf}n|SnfS(sGet a (firstbyte,lastbyte) tuple from a Range header value. Range headers have the form "bytes=-". This function pulls the firstbyte and lastbyte values and returns a (firstbyte,lastbyte) tuple. If lastbyte is not specified in the header value, it is returned as an empty string in the tuple. Return None if range_header is None Return () if range_header does not conform to the range spec pattern. Ns^bytes=(\d{1,})-(\d*)iii( t range_headerRTt_rangeretretcompiletmatchRtgroupttup(RRRR((RRVs    cCs_|djodSnt|}|o3|do|d|ddf}nd|SndS(sConvert a range tuple to a Range header value. Return a string of the form "bytes=-" or None if no range is needed. iis bytes=%s-%sN(RzRTR(Rz((Rtrange_tuple_to_headers   cCs|djodSn|d}|djo d}n t|}y|d}Wntj o d}n6X|djo d}n|djot|}n||fdjodSn||jotd||fn||fS(s7Normalize a (first_byte,last_byte) range tuple. Return a tuple whose first element is guaranteed to be an int and whose second element will be '' (meaning: the last byte) or an int. Finally, return None if the normalized tuple == (0,'') as that is equivelant to retrieving the entire file. iR7isInvalid byte range: %s-%sN(NR7(iR7(RzRTRWtintRXt IndexErrorR(RzRWRX((RRs(        (&R@RARRORDRTtDEBUGt cStringIORZt ImportErrorRR(Rt BaseHandlerRRRt FileHandlerR4RJRgRiRmRkRRReRLRR8R9t FTPHandlerR\RRRVRR(RkR9RRRRJRgRRR4RmR8ReRARRDRRORRRRRRLRZRVRRiR\R@((Rt?s6     # +1      Q4  0707010002bc0e000081a40000000000000002000000014ae194ad00000a99000000b600010006ffffffffffffffff0000004200000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/sslfactory.pycm 5Ec@sdkZdkZy1dklZdklZdklZeZWnej o e ZnXdZ eodfdYZ ndfdYZ dddZdS( N(sSSL(shttpslib(s m2urllib2t M2SSLFactorycBs/tZdZdZddZdZRS(NcCs|i|||_dS(N(tselft_get_ssl_contextt ssl_ca_certt ssl_context(RRR((ta/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/sslfactory.pyt__init__%scCsC|o4ti}|i||itid|Sn|SdS(s Create an ssl context using the CA cert file or ssl context. The CA cert is used first if it was passed as an option. If not, then the supplied ssl context is used. If no ssl context was supplied, None is returned. iN(RtSSLtContexttcontexttload_verify_locationst set_verifyt verify_peerR(RRRR ((RR(s  cCs-ti||i}|o ||_n|S(N(thttplibtHTTPSConnectionthostRRt connectiontresponse_class(RRRR((Rtcreate_https_connection8s cGsti|i|S(N(t m2urllib2t build_openerRRthandlers(RR((Rt create_opener>s(t__name__t __module__RRtNoneRR(((RR#s   t SSLFactorycBstZddZdZRS(NcCs'ti|}|o ||_n|S(N(R RRRR(RRRR((RRDs cGs ti|S(N(turllib2RR(RR((RRJs(RRRRR(((RRBs cCsMtot||Sn2|p|ototidqBntSdS(s: Return an SSLFactory, based on if M2Crypto is available. sHSSL arguments supplied, but M2Crypto is not available. Using Python SSL.N(t have_m2cryptoRRRtDEBUGtwarningR(RR((Rt get_factoryOs(R RtM2CryptoRthttpslibRtTrueRt ImportErrortFalseRRRRR( RR RR!RRRRRR((Rt?s        0707010002bc05000081a40000000000000002000000014513351d0000dde7000000b600010006ffffffffffffffff0000003e00000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/grabber.py# This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko """A high-level cross-protocol url-grabber. GENERAL ARGUMENTS (kwargs) Where possible, the module-level default is indicated, and legal values are provided. copy_local = 0 [0|1] ignored except for file:// urls, in which case it specifies whether urlgrab should still make a copy of the file, or simply point to the existing copy. The module level default for this option is 0. close_connection = 0 [0|1] tells URLGrabber to close the connection after a file has been transfered. This is ignored unless the download happens with the http keepalive handler (keepalive=1). Otherwise, the connection is left open for further use. The module level default for this option is 0 (keepalive connections will not be closed). keepalive = 1 [0|1] specifies whether keepalive should be used for HTTP/1.1 servers that support it. The module level default for this option is 1 (keepalive is enabled). progress_obj = None a class instance that supports the following methods: po.start(filename, url, basename, length, text) # length will be None if unknown po.update(read) # read == bytes read so far po.end() text = None specifies an alternativ text item in the beginning of the progress bar line. If not given, the basename of the file is used. throttle = 1.0 a number - if it's an int, it's the bytes/second throttle limit. If it's a float, it is first multiplied by bandwidth. If throttle == 0, throttling is disabled. If None, the module-level default (which can be set on default_grabber.throttle) is used. See BANDWIDTH THROTTLING for more information. timeout = None a positive float expressing the number of seconds to wait for socket operations. If the value is None or 0.0, socket operations will block forever. Setting this option causes urlgrabber to call the settimeout method on the Socket object used for the request. See the Python documentation on settimeout for more information. http://www.python.org/doc/current/lib/socket-objects.html bandwidth = 0 the nominal max bandwidth in bytes/second. If throttle is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set on default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for more information. range = None a tuple of the form (first_byte, last_byte) describing a byte range to retrieve. Either or both of the values may set to None. If first_byte is None, byte offset 0 is assumed. If last_byte is None, the last byte available is assumed. Note that the range specification is python-like in that (0,10) will yeild the first 10 bytes of the file. If set to None, no range will be used. reget = None [None|'simple'|'check_timestamp'] whether to attempt to reget a partially-downloaded file. Reget only applies to .urlgrab and (obviously) only if there is a partially downloaded file. Reget has two modes: 'simple' -- the local file will always be trusted. If there are 100 bytes in the local file, then the download will always begin 100 bytes into the requested file. 'check_timestamp' -- the timestamp of the server file will be compared to the timestamp of the local file. ONLY if the local file is newer than or the same age as the server file will reget be used. If the server file is newer, or the timestamp is not returned, the entire file will be fetched. NOTE: urlgrabber can do very little to verify that the partial file on disk is identical to the beginning of the remote file. You may want to either employ a custom "checkfunc" or simply avoid using reget in situations where corruption is a concern. user_agent = 'urlgrabber/VERSION' a string, usually of the form 'AGENT/VERSION' that is provided to HTTP servers in the User-agent header. The module level default for this option is "urlgrabber/VERSION". http_headers = None a tuple of 2-tuples, each containing a header and value. These will be used for http and https requests only. For example, you can do http_headers = (('Pragma', 'no-cache'),) ftp_headers = None this is just like http_headers, but will be used for ftp requests. proxies = None a dictionary that maps protocol schemes to proxy hosts. For example, to use a proxy server on host "foo" port 3128 for http and https URLs: proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' } note that proxy authentication information may be provided using normal URL constructs: proxies={ 'http' : 'http://user:host@foo:3128' } Lastly, if proxies is None, the default environment settings will be used. prefix = None a url prefix that will be prepended to all requested urls. For example: g = URLGrabber(prefix='http://foo.com/mirror/') g.urlgrab('some/file.txt') ## this will fetch 'http://foo.com/mirror/some/file.txt' This option exists primarily to allow identical behavior to MirrorGroup (and derived) instances. Note: a '/' will be inserted if necessary, so you cannot specify a prefix that ends with a partial file or directory name. opener = None Overrides the default urllib2.OpenerDirector provided to urllib2 when making requests. This option exists so that the urllib2 handler chain may be customized. Note that the range, reget, proxy, and keepalive features require that custom handlers be provided to urllib2 in order to function properly. If an opener option is provided, no attempt is made by urlgrabber to ensure chain integrity. You are responsible for ensuring that any extension handlers are present if said features are required. data = None Only relevant for the HTTP family (and ignored for other protocols), this allows HTTP POSTs. When the data kwarg is present (and not None), an HTTP request will automatically become a POST rather than GET. This is done by direct passthrough to urllib2. If you use this, you may also want to set the 'Content-length' and 'Content-type' headers with the http_headers option. Note that python 2.2 handles the case of these badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. RETRY RELATED ARGUMENTS retry = None the number of times to retry the grab before bailing. If this is zero, it will retry forever. This was intentional... really, it was :). If this value is not supplied or is supplied but is None retrying does not occur. retrycodes = [-1,2,4,5,6,7] a sequence of errorcodes (values of e.errno) for which it should retry. See the doc on URLGrabError for more details on this. You might consider modifying a copy of the default codes rather than building yours from scratch so that if the list is extended in the future (or one code is split into two) you can still enjoy the benefits of the default list. You can do that with something like this: retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes if 12 not in retrycodes: retrycodes.append(12) checkfunc = None a function to do additional checks. This defaults to None, which means no additional checking. The function should simply return on a successful check. It should raise URLGrabError on an unsuccessful check. Raising of any other exception will be considered immediate failure and no retries will occur. If it raises URLGrabError, the error code will determine the retry behavior. Negative error numbers are reserved for use by these passed in functions, so you can use many negative numbers for different types of failure. By default, -1 results in a retry, but this can be customized with retrycodes. If you simply pass in a function, it will be given exactly one argument: a CallbackObject instance with the .url attribute defined and either .filename (for urlgrab) or .data (for urlread). For urlgrab, .filename is the name of the local file. For urlread, .data is the actual string data. If you need other arguments passed to the callback (program state of some sort), you can do so like this: checkfunc=(function, ('arg1', 2), {'kwarg': 3}) if the downloaded file has filename /tmp/stuff, then this will result in this call (for urlgrab): function(obj, 'arg1', 2, kwarg=3) # obj.filename = '/tmp/stuff' # obj.url = 'http://foo.com/stuff' NOTE: both the "args" tuple and "kwargs" dict must be present if you use this syntax, but either (or both) can be empty. failure_callback = None The callback that gets called during retries when an attempt to fetch a file fails. The syntax for specifying the callback is identical to checkfunc, except for the attributes defined in the CallbackObject instance. The attributes for failure_callback are: exception = the raised exception url = the url we're trying to fetch tries = the number of tries so far (including this one) retry = the value of the retry option The callback is present primarily to inform the calling program of the failure, but if it raises an exception (including the one it's passed) that exception will NOT be caught and will therefore cause future retries to be aborted. The callback is called for EVERY failure, including the last one. On the last try, the callback can raise an alternate exception, but it cannot (without severe trickiness) prevent the exception from being raised. interrupt_callback = None This callback is called if KeyboardInterrupt is received at any point in the transfer. Basically, this callback can have three impacts on the fetch process based on the way it exits: 1) raise no exception: the current fetch will be aborted, but any further retries will still take place 2) raise a URLGrabError: if you're using a MirrorGroup, then this will prompt a failover to the next mirror according to the behavior of the MirrorGroup subclass. It is recommended that you raise URLGrabError with code 15, 'user abort'. If you are NOT using a MirrorGroup subclass, then this is the same as (3). 3) raise some other exception (such as KeyboardInterrupt), which will not be caught at either the grabber or mirror levels. That is, it will be raised up all the way to the caller. This callback is very similar to failure_callback. They are passed the same arguments, so you could use the same function for both. urlparser = URLParser() The URLParser class handles pre-processing of URLs, including auth-handling for user/pass encoded in http urls, file handing (that is, filenames not sent as a URL), and URL quoting. If you want to override any of this behavior, you can pass in a replacement instance. See also the 'quote' option. quote = None Whether or not to quote the path portion of a url. quote = 1 -> quote the URLs (they're not quoted yet) quote = 0 -> do not quote them (they're already quoted) quote = None -> guess what to do This option only affects proper urls like 'file:///etc/passwd'; it does not affect 'raw' filenames like '/etc/passwd'. The latter will always be quoted as they are converted to URLs. Also, only the path part of a url is quoted. If you need more fine-grained control, you should probably subclass URLParser and pass it in via the 'urlparser' option. BANDWIDTH THROTTLING urlgrabber supports throttling via two values: throttle and bandwidth Between the two, you can either specify and absolute throttle threshold or specify a theshold as a fraction of maximum available bandwidth. throttle is a number - if it's an int, it's the bytes/second throttle limit. If it's a float, it is first multiplied by bandwidth. If throttle == 0, throttling is disabled. If None, the module-level default (which can be set with set_throttle) is used. bandwidth is the nominal max bandwidth in bytes/second. If throttle is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set with set_bandwidth) is used. THROTTLING EXAMPLES: Lets say you have a 100 Mbps connection. This is (about) 10^8 bits per second, or 12,500,000 Bytes per second. You have a number of throttling options: *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float This will limit urlgrab to use half of your available bandwidth. *) set_throttle(6250000) # throttle is an int This will also limit urlgrab to use half of your available bandwidth, regardless of what bandwidth is set to. *) set_throttle(6250000); set_throttle(1.0) # float Use half your bandwidth *) set_throttle(6250000); set_throttle(2.0) # float Use up to 12,500,000 Bytes per second (your nominal max bandwidth) *) set_throttle(6250000); set_throttle(0) # throttle = 0 Disable throttling - this is more efficient than a very large throttle setting. *) set_throttle(0); set_throttle(1.0) # throttle is float, bandwidth = 0 Disable throttling - this is the default when the module is loaded. SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING) While this is flexible, it's not extremely obvious to the user. I suggest you implement a float throttle as a percent to make the distinction between absolute and relative throttling very explicit. Also, you may want to convert the units to something more convenient than bytes/second, such as kbps or kB/s, etc. """ # $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $ import os import os.path import sys import urlparse import rfc822 import time import string import urllib import urllib2 from stat import * # S_* and ST_* ######################################################################## # MODULE INITIALIZATION ######################################################################## try: exec('from ' + (__name__.split('.'))[0] + ' import __version__') except: __version__ = '???' import sslfactory auth_handler = urllib2.HTTPBasicAuthHandler( \ urllib2.HTTPPasswordMgrWithDefaultRealm()) try: from i18n import _ except ImportError, msg: def _(st): return st try: from httplib import HTTPException except ImportError, msg: HTTPException = None try: # This is a convenient way to make keepalive optional. # Just rename the module so it can't be imported. import keepalive from keepalive import HTTPHandler, HTTPSHandler have_keepalive = True except ImportError, msg: have_keepalive = False try: # add in range support conditionally too import byterange from byterange import HTTPRangeHandler, HTTPSRangeHandler, \ FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \ range_tuple_to_header, RangeError except ImportError, msg: range_handlers = () RangeError = None have_range = 0 else: range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(), FileRangeHandler(), FTPRangeHandler()) have_range = 1 # check whether socket timeout support is available (Python >= 2.3) import socket try: TimeoutError = socket.timeout have_socket_timeout = True except AttributeError: TimeoutError = None have_socket_timeout = False ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. DEBUG = None def set_logger(DBOBJ): """Set the DEBUG object. This is called by _init_default_logger when the environment variable URLGRABBER_DEBUG is set, but can also be called by a calling program. Basically, if the calling program uses the logging module and would like to incorporate urlgrabber logging, then it can do so this way. It's probably not necessary as most internal logging is only for debugging purposes. The passed-in object should be a logging.Logger instance. It will be pushed into the keepalive and byterange modules if they're being used. The mirror module pulls this object in on import, so you will need to manually push into it. In fact, you may find it tidier to simply push your logging object (or objects) into each of these modules independently. """ global DEBUG DEBUG = DBOBJ if have_keepalive and keepalive.DEBUG is None: keepalive.DEBUG = DBOBJ if have_range and byterange.DEBUG is None: byterange.DEBUG = DBOBJ if sslfactory.DEBUG is None: sslfactory.DEBUG = DBOBJ def _init_default_logger(): '''Examines the environment variable URLGRABBER_DEBUG and creates a logging object (logging.logger) based on the contents. It takes the form URLGRABBER_DEBUG=level,filename where "level" can be either an integer or a log level from the logging module (DEBUG, INFO, etc). If the integer is zero or less, logging will be disabled. Filename is the filename where logs will be sent. If it is "-", then stdout will be used. If the filename is empty or missing, stderr will be used. If the variable cannot be processed or the logging module cannot be imported (python < 2.3) then logging will be disabled. Here are some examples: URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout URLGRABBER_DEBUG=INFO # log info and higher to stderr This funtion is called during module initialization. It is not intended to be called from outside. The only reason it is a function at all is to keep the module-level namespace tidy and to collect the code into a nice block.''' try: dbinfo = os.environ['URLGRABBER_DEBUG'].split(',') import logging level = logging._levelNames.get(dbinfo[0], int(dbinfo[0])) if level < 1: raise ValueError() formatter = logging.Formatter('%(asctime)s %(message)s') if len(dbinfo) > 1: filename = dbinfo[1] else: filename = '' if filename == '': handler = logging.StreamHandler(sys.stderr) elif filename == '-': handler = logging.StreamHandler(sys.stdout) else: handler = logging.FileHandler(filename) handler.setFormatter(formatter) DBOBJ = logging.getLogger('urlgrabber') DBOBJ.addHandler(handler) DBOBJ.setLevel(level) except (KeyError, ImportError, ValueError): DBOBJ = None set_logger(DBOBJ) _init_default_logger() ######################################################################## # END MODULE INITIALIZATION ######################################################################## class URLGrabError(IOError): """ URLGrabError error codes: URLGrabber error codes (0 -- 255) 0 - everything looks good (you should never see this) 1 - malformed url 2 - local file doesn't exist 3 - request for non-file local file (dir, etc) 4 - IOError on fetch 5 - OSError on fetch 6 - no content length header when we expected one 7 - HTTPException 8 - Exceeded read limit (for urlread) 9 - Requested byte range not satisfiable. 10 - Byte range requested, but range support unavailable 11 - Illegal reget mode 12 - Socket timeout 13 - malformed proxy url 14 - HTTPError (includes .code and .exception attributes) 15 - user abort MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try Custom (non-builtin) classes derived from MirrorGroup (512 -- 767) [ this range reserved for application-specific error codes ] Retry codes (< 0) -1 - retry the download, unknown reason Note: to test which group a code is in, you can simply do integer division by 256: e.errno / 256 Negative codes are reserved for use by functions passed in to retrygrab with checkfunc. The value -1 is built in as a generic retry code and is already included in the retrycodes list. Therefore, you can create a custom check function that simply returns -1 and the fetch will be re-tried. For more customized retries, you can use other negative number and include them in retry-codes. This is nice for outputting useful messages about what failed. You can use these error codes like so: try: urlgrab(url) except URLGrabError, e: if e.errno == 3: ... # or print e.strerror # or simply print e #### print '[Errno %i] %s' % (e.errno, e.strerror) """ pass class CallbackObject: """Container for returned callback data. This is currently a dummy class into which urlgrabber can stuff information for passing to callbacks. This way, the prototype for all callbacks is the same, regardless of the data that will be passed back. Any function that accepts a callback function as an argument SHOULD document what it will define in this object. It is possible that this class will have some greater functionality in the future. """ def __init__(self, **kwargs): self.__dict__.update(kwargs) def urlgrab(url, filename=None, **kwargs): """grab the file at and make a local copy at If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if the copy_local kwarg == 0. See module documentation for a description of possible kwargs. """ return default_grabber.urlgrab(url, filename, **kwargs) def urlopen(url, **kwargs): """open the url and return a file object If a progress object or throttle specifications exist, then a special file object will be returned that supports them. The file object can be treated like any other file object. See module documentation for a description of possible kwargs. """ return default_grabber.urlopen(url, **kwargs) def urlread(url, limit=None, **kwargs): """read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' See module documentation for a description of possible kwargs. """ return default_grabber.urlread(url, limit, **kwargs) class URLParser: """Process the URLs before passing them to urllib2. This class does several things: * add any prefix * translate a "raw" file to a proper file: url * handle any http or https auth that's encoded within the url * quote the url Only the "parse" method is called directly, and it calls sub-methods. An instance of this class is held in the options object, which means that it's easy to change the behavior by sub-classing and passing the replacement in. It need only have a method like: url, parts = urlparser.parse(url, opts) """ def parse(self, url, opts): """parse the url and return the (modified) url and its parts Note: a raw file WILL be quoted when it's converted to a URL. However, other urls (ones which come with a proper scheme) may or may not be quoted according to opts.quote opts.quote = 1 --> quote it opts.quote = 0 --> do not quote it opts.quote = None --> guess """ quote = opts.quote if opts.prefix: url = self.add_prefix(url, opts.prefix) parts = urlparse.urlparse(url) (scheme, host, path, parm, query, frag) = parts if not scheme or (len(scheme) == 1 and scheme in string.letters): # if a scheme isn't specified, we guess that it's "file:" if url[0] not in '/\\': url = os.path.abspath(url) url = 'file:' + urllib.pathname2url(url) parts = urlparse.urlparse(url) quote = 0 # pathname2url quotes, so we won't do it again if scheme in ['http', 'https']: parts = self.process_http(parts) if quote is None: quote = self.guess_should_quote(parts) if quote: parts = self.quote(parts) url = urlparse.urlunparse(parts) return url, parts def add_prefix(self, url, prefix): if prefix[-1] == '/' or url[0] == '/': url = prefix + url else: url = prefix + '/' + url return url def process_http(self, parts): (scheme, host, path, parm, query, frag) = parts if '@' in host and auth_handler: try: user_pass, host = host.split('@', 1) if ':' in user_pass: user, password = user_pass.split(':', 1) except ValueError, e: raise URLGrabError(1, _('Bad URL: %s') % url) if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password) auth_handler.add_password(None, host, user, password) return (scheme, host, path, parm, query, frag) def quote(self, parts): """quote the URL This method quotes ONLY the path part. If you need to quote other parts, you should override this and pass in your derived class. The other alternative is to quote other parts before passing into urlgrabber. """ (scheme, host, path, parm, query, frag) = parts path = urllib.quote(path) return (scheme, host, path, parm, query, frag) hexvals = '0123456789ABCDEF' def guess_should_quote(self, parts): """ Guess whether we should quote a path. This amounts to guessing whether it's already quoted. find ' ' -> 1 find '%' -> 1 find '%XX' -> 0 else -> 1 """ (scheme, host, path, parm, query, frag) = parts if ' ' in path: return 1 ind = string.find(path, '%') if ind > -1: while ind > -1: if len(path) < ind+3: return 1 code = path[ind+1:ind+3].upper() if code[0] not in self.hexvals or \ code[1] not in self.hexvals: return 1 ind = string.find(path, '%', ind+1) return 0 return 1 class URLGrabberOptions: """Class to ease kwargs handling.""" def __init__(self, delegate=None, **kwargs): """Initialize URLGrabberOptions object. Set default values for all options and then update options specified in kwargs. """ self.delegate = delegate if delegate is None: self._set_defaults() self._set_attributes(**kwargs) def __getattr__(self, name): if self.delegate and hasattr(self.delegate, name): return getattr(self.delegate, name) raise AttributeError, name def raw_throttle(self): """Calculate raw throttle value from throttle and bandwidth values. """ if self.throttle <= 0: return 0 elif type(self.throttle) == type(0): return float(self.throttle) else: # throttle is a float return self.bandwidth * self.throttle def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the options specified in kwargs. """ return URLGrabberOptions(delegate=self, **kwargs) def _set_attributes(self, **kwargs): """Update object attributes with those provided in kwargs.""" self.__dict__.update(kwargs) if have_range and kwargs.has_key('range'): # normalize the supplied range value self.range = range_tuple_normalize(self.range) if not self.reget in [None, 'simple', 'check_timestamp']: raise URLGrabError(11, _('Illegal reget mode: %s') \ % (self.reget, )) def _set_defaults(self): """Set all options to their default values. When adding new options, make sure a default is provided here. """ self.progress_obj = None self.throttle = 1.0 self.bandwidth = 0 self.retry = None self.retrycodes = [-1,2,4,5,6,7] self.checkfunc = None self.copy_local = 0 self.close_connection = 0 self.range = None self.user_agent = 'urlgrabber/%s' % __version__ self.keepalive = 1 self.proxies = None self.reget = None self.failure_callback = None self.interrupt_callback = None self.prefix = None self.opener = None self.cache_openers = True self.timeout = None self.text = None self.http_headers = None self.ftp_headers = None self.data = None self.urlparser = URLParser() self.quote = None self.ssl_ca_cert = None self.ssl_context = None class URLGrabber: """Provides easy opening of URLs with a variety of options. All options are specified as kwargs. Options may be specified when the class is created and may be overridden on a per request basis. New objects inherit default values from default_grabber. """ def __init__(self, **kwargs): self.opts = URLGrabberOptions(**kwargs) def _retry(self, opts, func, *args): tries = 0 while 1: # there are only two ways out of this loop. The second has # several "sub-ways" # 1) via the return in the "try" block # 2) by some exception being raised # a) an excepton is raised that we don't "except" # b) a callback raises ANY exception # c) we're not retry-ing or have run out of retries # d) the URLGrabError code is not in retrycodes # beware of infinite loops :) tries = tries + 1 exception = None retrycode = None callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) try: r = apply(func, (opts,) + args, {}) if DEBUG: DEBUG.info('success') return r except URLGrabError, e: exception = e callback = opts.failure_callback retrycode = e.errno except KeyboardInterrupt, e: exception = e callback = opts.interrupt_callback if DEBUG: DEBUG.info('exception: %s', exception) if callback: if DEBUG: DEBUG.info('calling callback: %s', callback) cb_func, cb_args, cb_kwargs = self._make_callback(callback) obj = CallbackObject(exception=exception, url=args[0], tries=tries, retry=opts.retry) cb_func(obj, *cb_args, **cb_kwargs) if (opts.retry is None) or (tries == opts.retry): if DEBUG: DEBUG.info('retries exceeded, re-raising') raise if (retrycode is not None) and (retrycode not in opts.retrycodes): if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', retrycode, opts.retrycodes) raise def urlopen(self, url, **kwargs): """open the url and return a file object If a progress object or throttle value specified when this object was created, then a special file object will be returned that supports them. The file object can be treated like any other file object. """ opts = self.opts.derive(**kwargs) (url,parts) = opts.urlparser.parse(url, opts) def retryfunc(opts, url): return URLGrabberFileObject(url, filename=None, opts=opts) return self._retry(opts, retryfunc, url) def urlgrab(self, url, filename=None, **kwargs): """grab the file at and make a local copy at If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if copy_local == 0. """ opts = self.opts.derive(**kwargs) (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts if filename is None: filename = os.path.basename( urllib.unquote(path) ) if scheme == 'file' and not opts.copy_local: # just return the name of the local file - don't make a # copy currently path = urllib.url2pathname(path) if host: path = os.path.normpath('//' + host + path) if not os.path.exists(path): raise URLGrabError(2, _('Local file does not exist: %s') % (path, )) elif not os.path.isfile(path): raise URLGrabError(3, _('Not a normal file: %s') % (path, )) elif not opts.range: return path def retryfunc(opts, url, filename): fo = URLGrabberFileObject(url, filename, opts) try: fo._do_grab() if not opts.checkfunc is None: cb_func, cb_args, cb_kwargs = \ self._make_callback(opts.checkfunc) obj = CallbackObject() obj.filename = filename obj.url = url apply(cb_func, (obj, )+cb_args, cb_kwargs) finally: fo.close() return filename return self._retry(opts, retryfunc, url, filename) def urlread(self, url, limit=None, **kwargs): """read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' """ opts = self.opts.derive(**kwargs) (url,parts) = opts.urlparser.parse(url, opts) if limit is not None: limit = limit + 1 def retryfunc(opts, url, limit): fo = URLGrabberFileObject(url, filename=None, opts=opts) s = '' try: # this is an unfortunate thing. Some file-like objects # have a default "limit" of None, while the built-in (real) # file objects have -1. They each break the other, so for # now, we just force the default if necessary. if limit is None: s = fo.read() else: s = fo.read(limit) if not opts.checkfunc is None: cb_func, cb_args, cb_kwargs = \ self._make_callback(opts.checkfunc) obj = CallbackObject() obj.data = s obj.url = url apply(cb_func, (obj, )+cb_args, cb_kwargs) finally: fo.close() return s s = self._retry(opts, retryfunc, url, limit) if limit and len(s) > limit: raise URLGrabError(8, _('Exceeded limit (%i): %s') % (limit, url)) return s def _make_callback(self, callback_obj): if callable(callback_obj): return callback_obj, (), {} else: return callback_obj # create the default URLGrabber used by urlXXX functions. # NOTE: actual defaults are set in URLGrabberOptions default_grabber = URLGrabber() class URLGrabberFileObject: """This is a file-object wrapper that supports progress objects and throttling. This exists to solve the following problem: lets say you want to drop-in replace a normal open with urlopen. You want to use a progress meter and/or throttling, but how do you do that without rewriting your code? Answer: urlopen will return a wrapped file object that does the progress meter and-or throttling internally. """ def __init__(self, url, filename, opts): self.url = url self.filename = filename self.opts = opts self.fo = None self._rbuf = '' self._rbufsize = 1024*8 self._ttime = time.time() self._tsize = 0 self._amount_read = 0 self._opener = None self._do_open() def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. This includes methods.""" if hasattr(self.fo, name): return getattr(self.fo, name) raise AttributeError, name def _get_opener(self): """Build a urllib2 OpenerDirector based on request options.""" if self.opts.opener: return self.opts.opener elif self._opener is None: handlers = [] need_keepalive_handler = (have_keepalive and self.opts.keepalive) need_range_handler = (range_handlers and \ (self.opts.range or self.opts.reget)) # if you specify a ProxyHandler when creating the opener # it _must_ come before all other handlers in the list or urllib2 # chokes. if self.opts.proxies: handlers.append( CachedProxyHandler(self.opts.proxies) ) # ------------------------------------------------------- # OK, these next few lines are a serious kludge to get # around what I think is a bug in python 2.2's # urllib2. The basic idea is that default handlers # get applied first. If you override one (like a # proxy handler), then the default gets pulled, but # the replacement goes on the end. In the case of # proxies, this means the normal handler picks it up # first and the proxy isn't used. Now, this probably # only happened with ftp or non-keepalive http, so not # many folks saw it. The simple approach to fixing it # is just to make sure you override the other # conflicting defaults as well. I would LOVE to see # these go way or be dealt with more elegantly. The # problem isn't there after 2.2. -MDS 2005/02/24 if not need_keepalive_handler: handlers.append( urllib2.HTTPHandler() ) if not need_range_handler: handlers.append( urllib2.FTPHandler() ) # ------------------------------------------------------- ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert, self.opts.ssl_context) if need_keepalive_handler: handlers.append(HTTPHandler()) handlers.append(HTTPSHandler(ssl_factory)) if need_range_handler: handlers.extend( range_handlers ) handlers.append( auth_handler ) if self.opts.cache_openers: self._opener = CachedOpenerDirector(ssl_factory, *handlers) else: self._opener = ssl_factory.create_opener(*handlers) # OK, I don't like to do this, but otherwise, we end up with # TWO user-agent headers. self._opener.addheaders = [] return self._opener def _do_open(self): opener = self._get_opener() req = urllib2.Request(self.url, self.opts.data) # build request object self._add_headers(req) # add misc headers that we need self._build_range(req) # take care of reget and byterange stuff fo, hdr = self._make_request(req, opener) if self.reget_time and self.opts.reget == 'check_timestamp': # do this if we have a local file with known timestamp AND # we're in check_timestamp reget mode. fetch_again = 0 try: modified_tuple = hdr.getdate_tz('last-modified') modified_stamp = rfc822.mktime_tz(modified_tuple) if modified_stamp > self.reget_time: fetch_again = 1 except (TypeError,): fetch_again = 1 if fetch_again: # the server version is newer than the (incomplete) local # version, so we should abandon the version we're getting # and fetch the whole thing again. fo.close() self.opts.reget = None del req.headers['Range'] self._build_range(req) fo, hdr = self._make_request(req, opener) (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url) path = urllib.unquote(path) if not (self.opts.progress_obj or self.opts.raw_throttle() \ or self.opts.timeout): # if we're not using the progress_obj, throttling, or timeout # we can get a performance boost by going directly to # the underlying fileobject for reads. self.read = fo.read if hasattr(fo, 'readline'): self.readline = fo.readline elif self.opts.progress_obj: try: length = int(hdr['Content-Length']) length = length + self._amount_read # Account for regets except (KeyError, ValueError, TypeError): length = None self.opts.progress_obj.start(str(self.filename), urllib.unquote(self.url), os.path.basename(path), length, text=self.opts.text) self.opts.progress_obj.update(0) (self.fo, self.hdr) = (fo, hdr) def _add_headers(self, req): if self.opts.user_agent: req.add_header('User-agent', self.opts.user_agent) try: req_type = req.get_type() except ValueError: req_type = None if self.opts.http_headers and req_type in ('http', 'https'): for h, v in self.opts.http_headers: req.add_header(h, v) if self.opts.ftp_headers and req_type == 'ftp': for h, v in self.opts.ftp_headers: req.add_header(h, v) def _build_range(self, req): self.reget_time = None self.append = 0 reget_length = 0 rt = None if have_range and self.opts.reget and type(self.filename) == type(''): # we have reget turned on and we're dumping to a file try: s = os.stat(self.filename) except OSError: pass else: self.reget_time = s[ST_MTIME] reget_length = s[ST_SIZE] # Set initial length when regetting self._amount_read = reget_length rt = reget_length, '' self.append = 1 if self.opts.range: if not have_range: raise URLGrabError(10, _('Byte range requested but range '\ 'support unavailable')) rt = self.opts.range if rt[0]: rt = (rt[0] + reget_length, rt[1]) if rt: header = range_tuple_to_header(rt) if header: req.add_header('Range', header) def _make_request(self, req, opener): try: if have_socket_timeout and self.opts.timeout: old_to = socket.getdefaulttimeout() socket.setdefaulttimeout(self.opts.timeout) try: fo = opener.open(req) finally: socket.setdefaulttimeout(old_to) else: fo = opener.open(req) hdr = fo.info() except ValueError, e: raise URLGrabError(1, _('Bad URL: %s') % (e, )) except RangeError, e: raise URLGrabError(9, str(e)) except urllib2.HTTPError, e: new_e = URLGrabError(14, str(e)) new_e.code = e.code new_e.exception = e raise new_e except IOError, e: if hasattr(e, 'reason') and have_socket_timeout and \ isinstance(e.reason, TimeoutError): raise URLGrabError(12, _('Timeout: %s') % (e, )) else: raise URLGrabError(4, _('IOError: %s') % (e, )) except OSError, e: raise URLGrabError(5, _('OSError: %s') % (e, )) except HTTPException, e: raise URLGrabError(7, _('HTTP Exception (%s): %s') % \ (e.__class__.__name__, e)) else: return (fo, hdr) def _do_grab(self): """dump the file to self.filename.""" if self.append: new_fo = open(self.filename, 'ab') else: new_fo = open(self.filename, 'wb') bs = 1024*8 size = 0 block = self.read(bs) size = size + len(block) while block: new_fo.write(block) block = self.read(bs) size = size + len(block) new_fo.close() try: modified_tuple = self.hdr.getdate_tz('last-modified') modified_stamp = rfc822.mktime_tz(modified_tuple) os.utime(self.filename, (modified_stamp, modified_stamp)) except (TypeError,), e: pass return size def _fill_buffer(self, amt=None): """fill the buffer to contain at least 'amt' bytes by reading from the underlying file object. If amt is None, then it will read until it gets nothing more. It updates the progress meter and throttles after every self._rbufsize bytes.""" # the _rbuf test is only in this first 'if' for speed. It's not # logically necessary if self._rbuf and not amt is None: L = len(self._rbuf) if amt > L: amt = amt - L else: return # if we've made it here, then we don't have enough in the buffer # and we need to read more. buf = [self._rbuf] bufsize = len(self._rbuf) while amt is None or amt: # first, delay if necessary for throttling reasons if self.opts.raw_throttle(): diff = self._tsize/self.opts.raw_throttle() - \ (time.time() - self._ttime) if diff > 0: time.sleep(diff) self._ttime = time.time() # now read some data, up to self._rbufsize if amt is None: readamount = self._rbufsize else: readamount = min(amt, self._rbufsize) try: new = self.fo.read(readamount) except socket.error, e: raise URLGrabError(4, _('Socket Error: %s') % (e, )) except TimeoutError, e: raise URLGrabError(12, _('Timeout: %s') % (e, )) except IOError, e: raise URLGrabError(4, _('IOError: %s') %(e,)) newsize = len(new) if not newsize: break # no more to read if amt: amt = amt - newsize buf.append(new) bufsize = bufsize + newsize self._tsize = newsize self._amount_read = self._amount_read + newsize if self.opts.progress_obj: self.opts.progress_obj.update(self._amount_read) self._rbuf = string.join(buf, '') return def read(self, amt=None): self._fill_buffer(amt) if amt is None: s, self._rbuf = self._rbuf, '' else: s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] return s def readline(self, limit=-1): i = string.find(self._rbuf, '\n') while i < 0 and not (0 < limit <= len(self._rbuf)): L = len(self._rbuf) self._fill_buffer(L + self._rbufsize) if not len(self._rbuf) > L: break i = string.find(self._rbuf, '\n', L) if i < 0: i = len(self._rbuf) else: i = i+1 if 0 <= limit < len(self._rbuf): i = limit s, self._rbuf = self._rbuf[:i], self._rbuf[i:] return s def close(self): if self.opts.progress_obj: self.opts.progress_obj.end(self._amount_read) self.fo.close() if self.opts.close_connection: try: self.fo.close_connection() except: pass _handler_cache = [] def CachedOpenerDirector(ssl_factory = None, *handlers): for (cached_handlers, opener) in _handler_cache: if cached_handlers == handlers: for handler in opener.handlers: handler.add_parent(opener) return opener if not ssl_factory: ssl_factory = sslfactory.get_factory() opener = ssl_factory.create_opener(*handlers) _handler_cache.append( (handlers, opener) ) return opener _proxy_cache = [] def CachedProxyHandler(proxies): for (pdict, handler) in _proxy_cache: if pdict == proxies: if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies) break else: for k, v in proxies.items(): utype, url = urllib.splittype(v) host, other = urllib.splithost(url) if (utype is None) or (host is None): raise URLGrabError(13, _('Bad proxy URL: %s') % v) if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies) handler = urllib2.ProxyHandler(proxies) _proxy_cache.append( (proxies, handler) ) return handler ##################################################################### # DEPRECATED FUNCTIONS def set_throttle(new_throttle): """Deprecated. Use: default_grabber.throttle = new_throttle""" default_grabber.throttle = new_throttle def set_bandwidth(new_bandwidth): """Deprecated. Use: default_grabber.bandwidth = new_bandwidth""" default_grabber.bandwidth = new_bandwidth def set_progress_obj(new_progress_obj): """Deprecated. Use: default_grabber.progress_obj = new_progress_obj""" default_grabber.progress_obj = new_progress_obj def set_user_agent(new_user_agent): """Deprecated. Use: default_grabber.user_agent = new_user_agent""" default_grabber.user_agent = new_user_agent def retrygrab(url, filename=None, copy_local=0, close_connection=0, progress_obj=None, throttle=None, bandwidth=None, numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None): """Deprecated. Use: urlgrab() with the retry arg instead""" kwargs = {'copy_local' : copy_local, 'close_connection' : close_connection, 'progress_obj' : progress_obj, 'throttle' : throttle, 'bandwidth' : bandwidth, 'retry' : numtries, 'retrycodes' : retrycodes, 'checkfunc' : checkfunc } return urlgrab(url, filename, **kwargs) ##################################################################### # TESTING def _main_test(): import sys try: url, filename = sys.argv[1:3] except ValueError: print 'usage:', sys.argv[0], \ ' [copy_local=0|1] [close_connection=0|1]' sys.exit() kwargs = {} for a in sys.argv[3:]: k, v = string.split(a, '=', 1) kwargs[k] = int(v) set_throttle(1.0) set_bandwidth(32 * 1024) print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, default_grabber.bandwidth) try: from progress import text_progress_meter except ImportError, e: pass else: kwargs['progress_obj'] = text_progress_meter() try: name = apply(urlgrab, (url, filename), kwargs) except URLGrabError, e: print e else: print 'LOCAL FILE:', name def _retry_test(): import sys try: url, filename = sys.argv[1:3] except ValueError: print 'usage:', sys.argv[0], \ ' [copy_local=0|1] [close_connection=0|1]' sys.exit() kwargs = {} for a in sys.argv[3:]: k, v = string.split(a, '=', 1) kwargs[k] = int(v) try: from progress import text_progress_meter except ImportError, e: pass else: kwargs['progress_obj'] = text_progress_meter() def cfunc(filename, hello, there='foo'): print hello, there import random rnum = random.random() if rnum < .5: print 'forcing retry' raise URLGrabError(-1, 'forcing retry') if rnum < .75: print 'forcing failure' raise URLGrabError(-2, 'forcing immediate failure') print 'success' return kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'}) try: name = apply(retrygrab, (url, filename), kwargs) except URLGrabError, e: print e else: print 'LOCAL FILE:', name def _file_object_test(filename=None): import random, cStringIO, sys if filename is None: filename = __file__ print 'using file "%s" for comparisons' % filename fo = open(filename) s_input = fo.read() fo.close() for testfunc in [_test_file_object_smallread, _test_file_object_readall, _test_file_object_readline, _test_file_object_readlines]: fo_input = cStringIO.StringIO(s_input) fo_output = cStringIO.StringIO() wrapper = URLGrabberFileObject(fo_input, None, 0) print 'testing %-30s ' % testfunc.__name__, testfunc(wrapper, fo_output) s_output = fo_output.getvalue() if s_output == s_input: print 'passed' else: print 'FAILED' def _test_file_object_smallread(wrapper, fo_output): while 1: s = wrapper.read(23) fo_output.write(s) if not s: return def _test_file_object_readall(wrapper, fo_output): s = wrapper.read() fo_output.write(s) def _test_file_object_readline(wrapper, fo_output): while 1: s = wrapper.readline() fo_output.write(s) if not s: return def _test_file_object_readlines(wrapper, fo_output): li = wrapper.readlines() fo_output.write(string.join(li, '')) if __name__ == '__main__': _main_test() _retry_test() _file_object_test('test') 0707010002bc0b000081a40000000000000002000000014306562b0000473b000000b600010006ffffffffffffffff0000003f00000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/progress.py# This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko # $Id: progress.py,v 1.7 2005/08/19 21:59:07 mstenner Exp $ import sys import time import math import thread class BaseMeter: def __init__(self): self.update_period = 0.3 # seconds self.filename = None self.url = None self.basename = None self.text = None self.size = None self.start_time = None self.last_amount_read = 0 self.last_update_time = None self.re = RateEstimator() def start(self, filename=None, url=None, basename=None, size=None, now=None, text=None): self.filename = filename self.url = url self.basename = basename self.text = text #size = None ######### TESTING self.size = size if not size is None: self.fsize = format_number(size) + 'B' if now is None: now = time.time() self.start_time = now self.re.start(size, now) self.last_amount_read = 0 self.last_update_time = now self._do_start(now) def _do_start(self, now=None): pass def update(self, amount_read, now=None): # for a real gui, you probably want to override and put a call # to your mainloop iteration function here if now is None: now = time.time() if (now >= self.last_update_time + self.update_period) or \ not self.last_update_time: self.re.update(amount_read, now) self.last_amount_read = amount_read self.last_update_time = now self._do_update(amount_read, now) def _do_update(self, amount_read, now=None): pass def end(self, amount_read, now=None): if now is None: now = time.time() self.re.update(amount_read, now) self.last_amount_read = amount_read self.last_update_time = now self._do_end(amount_read, now) def _do_end(self, amount_read, now=None): pass class TextMeter(BaseMeter): def __init__(self, fo=sys.stderr): BaseMeter.__init__(self) self.fo = fo def _do_update(self, amount_read, now=None): etime = self.re.elapsed_time() fetime = format_time(etime) fread = format_number(amount_read) #self.size = None if self.text is not None: text = self.text else: text = self.basename if self.size is None: out = '\r%-60.60s %5sB %s ' % \ (text, fread, fetime) else: rtime = self.re.remaining_time() frtime = format_time(rtime) frac = self.re.fraction_read() bar = '='*int(25 * frac) out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \ (text, frac*100, bar, fread, frtime) self.fo.write(out) self.fo.flush() def _do_end(self, amount_read, now=None): total_time = format_time(self.re.elapsed_time()) total_size = format_number(amount_read) if self.text is not None: text = self.text else: text = self.basename if self.size is None: out = '\r%-60.60s %5sB %s ' % \ (text, total_size, total_time) else: bar = '='*25 out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \ (text, 100, bar, total_size, total_time) self.fo.write(out + '\n') self.fo.flush() text_progress_meter = TextMeter class MultiFileHelper(BaseMeter): def __init__(self, master): BaseMeter.__init__(self) self.master = master def _do_start(self, now): self.master.start_meter(self, now) def _do_update(self, amount_read, now): # elapsed time since last update self.master.update_meter(self, now) def _do_end(self, amount_read, now): self.ftotal_time = format_time(now - self.start_time) self.ftotal_size = format_number(self.last_amount_read) self.master.end_meter(self, now) def failure(self, message, now=None): self.master.failure_meter(self, message, now) def message(self, message): self.master.message_meter(self, message) class MultiFileMeter: helperclass = MultiFileHelper def __init__(self): self.meters = [] self.in_progress_meters = [] self._lock = thread.allocate_lock() self.update_period = 0.3 # seconds self.numfiles = None self.finished_files = 0 self.failed_files = 0 self.open_files = 0 self.total_size = None self.failed_size = 0 self.start_time = None self.finished_file_size = 0 self.last_update_time = None self.re = RateEstimator() def start(self, numfiles=None, total_size=None, now=None): if now is None: now = time.time() self.numfiles = numfiles self.finished_files = 0 self.failed_files = 0 self.open_files = 0 self.total_size = total_size self.failed_size = 0 self.start_time = now self.finished_file_size = 0 self.last_update_time = now self.re.start(total_size, now) self._do_start(now) def _do_start(self, now): pass def end(self, now=None): if now is None: now = time.time() self._do_end(now) def _do_end(self, now): pass def lock(self): self._lock.acquire() def unlock(self): self._lock.release() ########################################################### # child meter creation and destruction def newMeter(self): newmeter = self.helperclass(self) self.meters.append(newmeter) return newmeter def removeMeter(self, meter): self.meters.remove(meter) ########################################################### # child functions - these should only be called by helpers def start_meter(self, meter, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') self._lock.acquire() try: if not meter in self.in_progress_meters: self.in_progress_meters.append(meter) self.open_files += 1 finally: self._lock.release() self._do_start_meter(meter, now) def _do_start_meter(self, meter, now): pass def update_meter(self, meter, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') if (now >= self.last_update_time + self.update_period) or \ not self.last_update_time: self.re.update(self._amount_read(), now) self.last_update_time = now self._do_update_meter(meter, now) def _do_update_meter(self, meter, now): pass def end_meter(self, meter, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') self._lock.acquire() try: try: self.in_progress_meters.remove(meter) except ValueError: pass self.open_files -= 1 self.finished_files += 1 self.finished_file_size += meter.last_amount_read finally: self._lock.release() self._do_end_meter(meter, now) def _do_end_meter(self, meter, now): pass def failure_meter(self, meter, message, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') self._lock.acquire() try: try: self.in_progress_meters.remove(meter) except ValueError: pass self.open_files -= 1 self.failed_files += 1 if meter.size and self.failed_size is not None: self.failed_size += meter.size else: self.failed_size = None finally: self._lock.release() self._do_failure_meter(meter, message, now) def _do_failure_meter(self, meter, message, now): pass def message_meter(self, meter, message): pass ######################################################## # internal functions def _amount_read(self): tot = self.finished_file_size for m in self.in_progress_meters: tot += m.last_amount_read return tot class TextMultiFileMeter(MultiFileMeter): def __init__(self, fo=sys.stderr): self.fo = fo MultiFileMeter.__init__(self) # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:## def _do_update_meter(self, meter, now): self._lock.acquire() try: format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \ "time: %8.8s/%8.8s" df = self.finished_files tf = self.numfiles or 1 pf = 100 * float(df)/tf + 0.49 dd = self.re.last_amount_read td = self.total_size pd = 100 * (self.re.fraction_read() or 0) + 0.49 dt = self.re.elapsed_time() rt = self.re.remaining_time() if rt is None: tt = None else: tt = dt + rt fdd = format_number(dd) + 'B' ftd = format_number(td) + 'B' fdt = format_time(dt, 1) ftt = format_time(tt, 1) out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt)) self.fo.write('\r' + out) self.fo.flush() finally: self._lock.release() def _do_end_meter(self, meter, now): self._lock.acquire() try: format = "%-30.30s %6.6s %8.8s %9.9s" fn = meter.basename size = meter.last_amount_read fsize = format_number(size) + 'B' et = meter.re.elapsed_time() fet = format_time(et, 1) frate = format_number(size / et) + 'B/s' out = '%-79.79s' % (format % (fn, fsize, fet, frate)) self.fo.write('\r' + out + '\n') finally: self._lock.release() self._do_update_meter(meter, now) def _do_failure_meter(self, meter, message, now): self._lock.acquire() try: format = "%-30.30s %6.6s %s" fn = meter.basename if type(message) in (type(''), type(u'')): message = message.splitlines() if not message: message = [''] out = '%-79s' % (format % (fn, 'FAILED', message[0] or '')) self.fo.write('\r' + out + '\n') for m in message[1:]: self.fo.write(' ' + m + '\n') self._lock.release() finally: self._do_update_meter(meter, now) def message_meter(self, meter, message): self._lock.acquire() try: pass finally: self._lock.release() def _do_end(self, now): self._do_update_meter(None, now) self._lock.acquire() try: self.fo.write('\n') self.fo.flush() finally: self._lock.release() ###################################################################### # support classes and functions class RateEstimator: def __init__(self, timescale=5.0): self.timescale = timescale def start(self, total=None, now=None): if now is None: now = time.time() self.total = total self.start_time = now self.last_update_time = now self.last_amount_read = 0 self.ave_rate = None def update(self, amount_read, now=None): if now is None: now = time.time() if amount_read == 0: # if we just started this file, all bets are off self.last_update_time = now self.last_amount_read = 0 self.ave_rate = None return #print 'times', now, self.last_update_time time_diff = now - self.last_update_time read_diff = amount_read - self.last_amount_read self.last_update_time = now self.last_amount_read = amount_read self.ave_rate = self._temporal_rolling_ave(\ time_diff, read_diff, self.ave_rate, self.timescale) #print 'results', time_diff, read_diff, self.ave_rate ##################################################################### # result methods def average_rate(self): "get the average transfer rate (in bytes/second)" return self.ave_rate def elapsed_time(self): "the time between the start of the transfer and the most recent update" return self.last_update_time - self.start_time def remaining_time(self): "estimated time remaining" if not self.ave_rate or not self.total: return None return (self.total - self.last_amount_read) / self.ave_rate def fraction_read(self): """the fraction of the data that has been read (can be None for unknown transfer size)""" if self.total is None: return None elif self.total == 0: return 1.0 else: return float(self.last_amount_read)/self.total ######################################################################### # support methods def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale): """a temporal rolling average performs smooth averaging even when updates come at irregular intervals. This is performed by scaling the "epsilon" according to the time since the last update. Specifically, epsilon = time_diff / timescale As a general rule, the average will take on a completely new value after 'timescale' seconds.""" epsilon = time_diff / timescale if epsilon > 1: epsilon = 1.0 return self._rolling_ave(time_diff, read_diff, last_ave, epsilon) def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon): """perform a "rolling average" iteration a rolling average "folds" new data into an existing average with some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive) a value of 0.0 means only the old value (initial value) counts, and a value of 1.0 means only the newest value is considered.""" try: recent_rate = read_diff / time_diff except ZeroDivisionError: recent_rate = None if last_ave is None: return recent_rate elif recent_rate is None: return last_ave # at this point, both last_ave and recent_rate are numbers return epsilon * recent_rate + (1 - epsilon) * last_ave def _round_remaining_time(self, rt, start_time=15.0): """round the remaining time, depending on its size If rt is between n*start_time and (n+1)*start_time round downward to the nearest multiple of n (for any counting number n). If rt < start_time, round down to the nearest 1. For example (for start_time = 15.0): 2.7 -> 2.0 25.2 -> 25.0 26.4 -> 26.0 35.3 -> 34.0 63.6 -> 60.0 """ if rt < 0: return 0.0 shift = int(math.log(rt/start_time)/math.log(2)) rt = int(rt) if shift <= 0: return rt return float(int(rt) >> shift << shift) def format_time(seconds, use_hours=0): if seconds is None or seconds < 0: if use_hours: return '--:--:--' else: return '--:--' else: seconds = int(seconds) minutes = seconds / 60 seconds = seconds % 60 if use_hours: hours = minutes / 60 minutes = minutes % 60 return '%02i:%02i:%02i' % (hours, minutes, seconds) else: return '%02i:%02i' % (minutes, seconds) def format_number(number, SI=0, space=' '): """Turn numbers into human-readable metric-like numbers""" symbols = ['', # (none) 'k', # kilo 'M', # mega 'G', # giga 'T', # tera 'P', # peta 'E', # exa 'Z', # zetta 'Y'] # yotta if SI: step = 1000.0 else: step = 1024.0 thresh = 999 depth = 0 max_depth = len(symbols) - 1 # we want numbers between 0 and thresh, but don't exceed the length # of our list. In that event, the formatting will be screwed up, # but it'll still show the right number. while number > thresh and depth < max_depth: depth = depth + 1 number = number / step if type(number) == type(1) or type(number) == type(1L): # it's an int or a long, which means it didn't get divided, # which means it's already short enough format = '%i%s%s' elif number < 9.95: # must use 9.95 for proper sizing. For example, 9.99 will be # rounded to 10.0 with the .1f format string (which is too long) format = '%.1f%s%s' else: format = '%.0f%s%s' return(format % (float(number or 0), space, symbols[depth])) 0707010002bc0c000081a40000000000000002000000014ae194ad0000429f000000b600010006ffffffffffffffff0000004000000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/progress.pycm +VCc@sdkZdkZdkZdkZdfdYZdefdYZeZdefdYZdfdYZd efd YZ d fd YZ d dZ d ddZ dS(Nt BaseMetercBshtZdZdddddddZddZddZddZddZddZ RS(NcCsad|_d|_d|_d|_d|_d|_d|_d|_ d|_ t |_ dS(Nf0.29999999999999999i( tselft update_periodtNonetfilenameturltbasenamettexttsizet start_timetlast_amount_readtlast_update_timet RateEstimatortre(R((t_/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/progress.pyt__init__s         cCs||_||_||_||_||_|dj ot|d|_n|djot i }n||_ |i i ||d|_||_|i|dS(NtBi(RRRRRRRt format_numbertfsizetnowttimeR R tstartR R t _do_start(RRRRRRR((RR)s          cCsdS(N((RR((RR;scCs||djoti}n||i|ijp |i o9|ii||||_ ||_|i ||ndS(N( RRRRR RR tupdatet amount_readR t _do_update(RRR((RR>s "  cCsdS(N((RRR((RRIscCsV|djoti}n|ii||||_||_|i ||dS(N( RRRRR RRR R t_do_end(RRR((RtendLs    cCsdS(N((RRR((RRSs( t__name__t __module__RRRRRRRR(((RRs    t TextMetercBs/tZeidZddZddZRS(NcCsti|||_dS(N(RRRtfo(RR((RRWs c Cs|ii}t|} t|}|i dj o |i }n |i }|i djod||| f} n\|ii}t|}|ii}dtd|} d||d| ||f} |ii| |iidS(Ns %-60.60s %5sB %s t=is( %-25.25s %3i%% |%-25.25s| %5sB %8s ETA id(RR t elapsed_timetetimet format_timetfetimeRRtfreadRRRRtouttremaining_timetrtimetfrtimet fraction_readtfractinttbarRtwritetflush( RRRR(R+RR%R"R)R-R$R&((RR[s     cCst|ii}t|}|idj o |i}n |i }|i djod|||f}n$dd}d|d|||f}|ii|d|iidS(Ns %-60.60s %5sB %s R is( %-25.25s %3i%% |%-25.25s| %5sB %8s ids (R#RR R!t total_timeRRt total_sizeRRRRR&R-RR.R/(RRRR0R-R1RR&((RRss    (RRtsyststderrRRRR(((RRVs tMultiFileHelpercBsAtZdZdZdZdZddZdZRS(NcCsti|||_dS(N(RRRtmaster(RR5((RRs cCs|ii||dS(N(RR5t start_meterR(RR((RRscCs|ii||dS(N(RR5t update_meterR(RRR((RRscCs?t||i|_t|i|_|ii ||dS(N( R#RRR t ftotal_timeRR t ftotal_sizeR5t end_meter(RRR((RRscCs|ii|||dS(N(RR5t failure_metertmessageR(RR<R((RtfailurescCs|ii||dS(N(RR5t message_meterR<(RR<((RR<s( RRRRRRRR=R<(((RR4s      tMultiFileMetercBstZeZdZddddZdZddZdZ dZ dZ dZ d Z d Zd Zd Zd ZdZdZdZdZdZdZRS(NcCsg|_g|_ti|_d|_d|_d|_ d|_ d|_ d|_ d|_ d|_d|_d|_t|_dS(Nf0.29999999999999999i(Rtmeterstin_progress_meterstthreadt allocate_lockt_lockRRtnumfilestfinished_filest failed_filest open_filesR1t failed_sizeR tfinished_file_sizeR R R (R((RRs            cCs|djoti}n||_d|_d|_d|_||_d|_ ||_ d|_ ||_ |i i|||i|dS(Ni(RRRRERRFRGRHR1RIR RJR R RR(RRER1R((RRs          cCsdS(N((RR((RRscCs.|djoti}n|i|dS(N(RRRRR(RR((RRs cCsdS(N((RR((RRscCs|iidS(N(RRDtacquire(R((RtlockscCs|iidS(N(RRDtrelease(R((RtunlockscCs#|i|}|ii||S(N(Rt helperclasstnewmeterR@tappend(RRP((RtnewMeterscCs|ii|dS(N(RR@tremovetmeter(RRT((Rt removeMeterscCs||ijotdn|iiz7||ijo#|ii||id7_nWd|ii X|i ||dS(Nsattempt to use orphaned meteri( RTRR@t ValueErrorRDRKRARQRHRMt_do_start_meterR(RRTR((RR6s cCsdS(N((RRTR((RRWscCs|||ijotdn||i|ijp |i o6|ii|i |||_|i ||ndS(Nsattempt to use orphaned meter( RTRR@RVRR RR Rt _amount_readt_do_update_meter(RRTR((RR7s " cCsdS(N((RRTR((RRYscCs||ijotdn|iiz^y|ii|Wntj onX|id8_|i d7_ |i |i 7_ Wd|ii X|i ||dS(Nsattempt to use orphaned meteri(RTRR@RVRDRKRARSRHRFRJR RMt _do_end_meterR(RRTR((RR:s cCsdS(N((RRTR((RRZscCs||ijotdn|iizy|ii|Wntj onX|id8_|i d7_ |i o&|i dj o|i |i 7_ n d|_ Wd|ii X|i|||dS(Nsattempt to use orphaned meteri(RTRR@RVRDRKRARSRHRGRRIRRMt_do_failure_meterR<R(RRTR<R((RR;s   cCsdS(N((RRTR<R((RR[scCsdS(N((RRTR<((RR>scCs.|i}x|iD]}||i7}qW|S(N(RRJttotRAtmR (RR]R\((RRXs   (RRR4RORRRRRRRLRNRRRUR6RWR7RYR:RZR;R[R>RX(((RR?s(               tTextMultiFileMetercBsDtZeidZdZdZdZdZdZ RS(NcCs||_ti|dS(N(RRR?R(RR((RR$s c CsT|iiz2d} |i} |ipd} dt| | d}|i i } |i } d|i ipdd}|i i}|i i}|djo d}n ||}t| d}t| d}t|d}t|d}d| | | ||||||f}|iid||ii Wd|ii!XdS( NsDfiles: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% time: %8.8s/%8.8siidf0.48999999999999999iRs%-79.79ss ("RRDRKtformatRFtdfREttftfloattpfR R tddR1ttdR*tpdR!tdtR'trtRtttRtfddtftdR#tfdttfttR&RR.R/RM(RRTRR&RhRmRcRkRfRaReRdR_R`RgRjRlRi((RRY)s.      &c Cs|iizd}|i} |i} t | d}|i i }t|d}t | |d}d|| |||f}|iid|dWd|iiX|i||dS(Ns %-30.30s %6.6s %8.8s %9.9sRisB/ss%-79.79ss s (RRDRKR_RTRtfnR RRRR R!tetR#tfettfrateR&RR.RMRYR( RRTRRqR_RRpRoR&RnR((RRZDs   cCs|iizd}|i}t|tdtdfjo|i }n|p dg}nd||d|dpdf}|i i d|dx*|d D]}|i i d |dqW|iiWd|i||XdS( Ns%-30.30s %6.6s %stus%-79stFAILEDis s is (RRDRKR_RTRRnttypeR<t splitlinesR&RR.R]RMRYR(RRTR<RR_R]RnR&((RR[Us   % " cCs&|iizWd|iiXdS(N(RRDRKRM(RRTR<((RR>ds  cCsS|id||iiz!|iid|iiWd|ii XdS(Ns ( RRYRRRDRKRR.R/RM(RR((RRks ( RRR2R3RRYRZR[R>R(((RR^#s     R cBsqtZddZdddZddZdZdZdZdZ d Z d Z d d Z RS( Nf5.0cCs ||_dS(N(t timescaleR(RRv((RRxscCsN|djoti}n||_||_||_d|_d|_dS(Ni( RRRttotalRR R R tave_rate(RRwR((RR{s     cCs|djoti}n|djo#||_d|_d|_dSn||i}||i}||_||_|i |||i|i |_dS(Ni( RRRRRR R Rxt time_difft read_difft_temporal_rolling_aveRv(RRRRzRy((RRs         cCs|iS(s/get the average transfer rate (in bytes/second)N(RRx(R((Rt average_ratescCs|i|iS(sEthe time between the start of the transfer and the most recent updateN(RR R (R((RR!scCs3|i p |i odSn|i|i|iS(sestimated time remainingN(RRxRwRR (R((RR'scCsH|idjodSn-|idjodSnt|i|iSdS(s[the fraction of the data that has been read (can be None for unknown transfer size)if1.0N(RRwRRbR (R((RR*s cCs7||}|djo d}n|i||||S(sqa temporal rolling average performs smooth averaging even when updates come at irregular intervals. This is performed by scaling the "epsilon" according to the time since the last update. Specifically, epsilon = time_diff / timescale As a general rule, the average will take on a completely new value after 'timescale' seconds.if1.0N(RyRvtepsilonRt _rolling_aveRztlast_ave(RRyRzRRvR}((RR{s   cCshy||}Wntj o d}nX|djo|Sn|djo|Sn||d||S(sKperform a "rolling average" iteration a rolling average "folds" new data into an existing average with some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive) a value of 0.0 means only the old value (initial value) counts, and a value of 1.0 means only the newest value is considered.iN(RzRyt recent_ratetZeroDivisionErrorRRR}(RRyRzRR}R((RR~s   f15.0cCst|djodSntti||tid}t|}|djo|Sntt||?|>S(sround the remaining time, depending on its size If rt is between n*start_time and (n+1)*start_time round downward to the nearest multiple of n (for any counting number n). If rt < start_time, round down to the nearest 1. For example (for start_time = 15.0): 2.7 -> 2.0 25.2 -> 25.0 26.4 -> 26.0 35.3 -> 34.0 63.6 -> 60.0 if0.0iN(RhR,tmathtlogR tshiftRb(RRhR R((Rt_round_remaining_times  &  ( RRRRRRR|R!R'R*R{R~R(((RR ws      icCs|djp |djo|odSqdSn_t|}|d}|d}|o)|d}|d}d|||fSnd||fSdS(Nis--:--:--s--:--i<s%02i:%02i:%02is %02i:%02i(tsecondsRt use_hoursR,tminutesthours(RRRR((RR#s     t c Csddddddddd g }|o d }nd }d }d }t|d}x3||jo%||jo|d}||}qWWt|tdjpt|tdjo d}n|djo d}nd}|t |pd |||fS(s4Turn numbers into human-readable metric-like numbersRrtktMtGtTtPtEtZtYf1000.0f1024.0iiils%i%s%sf9.9499999999999993s%.1f%s%ss%.0f%s%sN( tsymbolstSItsteptthreshtdepthtlent max_depthtnumberRtR_Rbtspace( RRRRR_RRRR((RRs$!   2   ( R2RRRBRRttext_progress_meterR4R?R^R R#R( RRBR^R?RR#R2R4RR RRR((Rt?s    ;.Tg 0707010002bc0a000081a40000000000000002000000014ae194ad00003f94000000b600010006ffffffffffffffff0000003e00000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/mirror.pycm Cc@sdZdkZdkZdklZlZlZydklZWne j oZ dZnXdfdYZ dfdYZ d e fd YZ d e fd YZed jondS(s| Module for downloading files from a pool of mirrors DESCRIPTION This module provides support for downloading files from a pool of mirrors with configurable failover policies. To a large extent, the failover policy is chosen by using different classes derived from the main class, MirrorGroup. Instances of MirrorGroup (and cousins) act very much like URLGrabber instances in that they have urlread, urlgrab, and urlopen methods. They can therefore, be used in very similar ways. from urlgrabber.grabber import URLGrabber from urlgrabber.mirror import MirrorGroup gr = URLGrabber() mg = MirrorGroup(gr, ['http://foo.com/some/directory/', 'http://bar.org/maybe/somewhere/else/', 'ftp://baz.net/some/other/place/entirely/'] mg.urlgrab('relative/path.zip') The assumption is that all mirrors are identical AFTER the base urls specified, so that any mirror can be used to fetch any file. FAILOVER The failover mechanism is designed to be customized by subclassing from MirrorGroup to change the details of the behavior. In general, the classes maintain a master mirror list and a "current mirror" index. When a download is initiated, a copy of this list and index is created for that download only. The specific failover policy depends on the class used, and so is documented in the class documentation. Note that ANY behavior of the class can be overridden, so any failover policy at all is possible (although you may need to change the interface in extreme cases). CUSTOMIZATION Most customization of a MirrorGroup object is done at instantiation time (or via subclassing). There are four major types of customization: 1) Pass in a custom urlgrabber - The passed in urlgrabber will be used (by default... see #2) for the grabs, so options to it apply for the url-fetching 2) Custom mirror list - Mirror lists can simply be a list of stings mirrors (as shown in the example above) but each can also be a dict, allowing for more options. For example, the first mirror in the list above could also have been: {'mirror': 'http://foo.com/some/directory/', 'grabber': , 'kwargs': { }} All mirrors are converted to this format internally. If 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. 4) Finally, any kwargs passed in for the specific file (to the urlgrab method, for example) will be folded in. The options passed into the grabber's urlXXX methods will override any options specified in a custom mirror dict. N(s URLGrabErrorsCallbackObjectsDEBUG(t_cCs|S(N(tst(R((t]/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/mirror.pyRcst GrabRequestcBstZdZRS(sThis is a dummy class used to hold information about the specific request. For example, a single file. By maintaining this information separately, we can accomplish two things: 1) make it a little easier to be threadsafe 2) have request-specific parameters (t__name__t __module__t__doc__(((RRes t MirrorGroupcBstZdZdZddgZdZdZdZdZdZ hd Z d Z d Z e d Zd Ze dZRS(s?Base Mirror class Instances of this class are built with a grabber object and a list of mirrors. Then all calls to urlXXX should be passed relative urls. The requested file will be searched for on the first mirror. If the grabber raises an exception (possibly after some retries) then that mirror will be removed from the list, and the next will be attempted. If all mirrors are exhausted, then an exception will be raised. MirrorGroup has the following failover policy: * downloads begin with the first mirror * by default (see default_action below) a failure (after retries) causes it to increment the local AND master indices. Also, the current mirror is removed from the local list (but NOT the master list - the mirror can potentially be used for other files) * if the local list is ever exhausted, a URLGrabError will be raised (errno=256, no more mirrors) OPTIONS In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: default_action A dict that describes the actions to be taken upon failure (after retries). default_action can contain any of the following keys (shown here with their default values): default_action = {'increment': 1, 'increment_master': 1, 'remove': 1, 'remove_master': 0, 'fail': 0} In this context, 'increment' means "use the next mirror" and 'remove' means "never use this mirror again". The two 'master' values refer to the instance-level mirror list (used for all files), whereas the non-master values refer to the current download only. The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current download. This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) at method-execution time (only applies to current fetch), filename = mg.urlgrab(url, default_action={'increment': 0}) or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default 'remove': 1, # class default 'remove_master': 0, # class default 'fail': 0} # set at instantiation, reset # from callback failure_callback this is a callback that will be called when a mirror "fails", meaning the grabber raises some URLGrabError. If this is a tuple, it is interpreted to be of the form (cb, args, kwargs) where cb is the actual callable object (function, method, etc). Otherwise, it is assumed to be the callable object itself. The callback will be passed a grabber.CallbackObject instance along with args and kwargs (if present). The following attributes are defined withing the instance: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror # and .relative_url The failure callback can return an action dict, as described above. Like default_action, the failure_callback can be set at instantiation time or when the urlXXX method is called. In the latter case, it applies only for that fetch. The callback can re-raise the exception quite easily. For example, this is a perfectly adequate callback function: def callback(obj): raise obj.exception WARNING: do not save the exception object (or the CallbackObject instance). As they contain stack frame references, they can lead to circular references. Notes: * The behavior can be customized by deriving and overriding the 'CONFIGURATION METHODS' * The 'grabber' instance is kept as a reference, not copied. Therefore, the grabber instance can be modified externally and changes will take effect immediately. cKsM||_|i||_d|_ti|_d|_ |i |dS(sInitialize the MirrorGroup object. REQUIRED ARGUMENTS grabber - URLGrabber instance mirrors - a list of mirrors OPTIONAL ARGUMENTS failure_callback - callback to be used when a mirror fails default_action - dict of failure actions See the module-level and class level documentation for more details. iN( tgrabbertselft_parse_mirrorstmirrorst_nexttthreadt allocate_lockt_locktNonetdefault_actiont_process_kwargstkwargs(R RR R((Rt__init__s   Rtfailure_callbackcCs(|id|_|id|_dS(NRR(RtgetR RR(R R((RRscCsTg}xG|D]?}t|tdjohd|<}n|i|q W|S(Nttmirror(tparsed_mirrorsR tmttypetappend(R R RR((RR scCs<|iit|i|_|i|_|iidS(N(R RtacquiretlistR tgrR trelease(R R((Rt_load_grs  cCs1|iptdtdn|i|iS(NisNo more mirrors to try.(RR t URLGrabErrorRR (R R((Rt _get_mirrors cCs|iidp|i}|oVt|tfjo|\}}}nfh}}||||ph}nh}t |i ph}|i|iidh|i||}|i|||o|iddondS(NRRtfaili(RtkwRR RtcbRtargsRtcb_objtactiontdictRtatupdatetincrement_mirror(R RR(R+R&R'RR)((Rt_failure!s  c Cs|i|i}|iiy|ii|}Wnt j onX|i ddo|i|=n7|i|jo&|i ddo|id7_n|it |ijo d|_n|ii |i ddo|i|i=n'|i ddo|id7_n|it |ijo d|_ntog}|iD]}||dqX~}tidd i||ig}|iD]}||dq~}tid d i||ind S( sTell the mirror object increment the mirror index This increments the mirror index, which amounts to telling the mirror object to use a different mirror (for this and future downloads). This is a SEMI-public method. It will be called internally, and you may never need to call it. However, it is provided (and is made public) so that the calling program can increment the mirror choice for methods like urlopen. For example, with urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only 2 remove the mirror permanently beware of remove=0 as it can lead to infinite loops t remove_masteritincrement_masteritremovet incrementRsGR mirrors: [%s] %it sMAIN mirrors: [%s] %iN(RR R t badmirrorR RRtindextindt ValueErrorR)RtlenR tDEBUGt_[1]Rtgrmtinfotjointselfm( R RR)R>RR:R6R4R;((RR-;s2 #   ((cCs<|idp|ido ||Sn |d|SdS(Nt/(tbase_urltendswithtrel_urlt startswith(R R@RB((Rt _join_urlss  c Cst}||_||_t||_|i|x2|iD]'} y || =WqAt j oqAXqAWx |i |}|i |d|i}t|idh} | i||idp|i}t||} totid||ny| |f| SWqotj oe} totidnt}| |_|d|_|i|_||_|i||qoXqoWdS(NiRRRsMIRROR: trying %s -> %ssMIRROR: failed(RRtfuncturlR*R%R R!toptionstktKeyErrorR#t mirrorchoiceRDtfullurlRRR,Rtgetattrtfunc_refR9R<R"tetCallbackObjecttobjt exceptionRt relative_urlR.( R RERFR%RRJRKRRPRRHRMRN((Rt _mirror_tryys@             cKs/t|}||ddkZd kl Z l!Z!l"Z"l#Z#l$Z$l%Z%l&Z&Wn'ej oZfZ'eZ&dZ(n&Xe e!e"e#fZ'd Z(dk)Z)ye)i*Z+eZ,Wne-j oeZ+eZ,nXea.dZ/dZ0e0de1fdYZ2dfdYZ3edZ4dZ5edZ6dfdYZ7dfdYZ8dfdYZ9e9Z:dfdYZ;gZ<edZ=gZ>d Z?d!Z@d"ZAd#ZBd$ZCeddeeed%d&d'd(d)d*d+ged, ZDd-ZEd.ZFed/ZGd0ZHd1ZId2ZJd3ZKe d4joeEeFeGd5ndS(6s6A high-level cross-protocol url-grabber. GENERAL ARGUMENTS (kwargs) Where possible, the module-level default is indicated, and legal values are provided. copy_local = 0 [0|1] ignored except for file:// urls, in which case it specifies whether urlgrab should still make a copy of the file, or simply point to the existing copy. The module level default for this option is 0. close_connection = 0 [0|1] tells URLGrabber to close the connection after a file has been transfered. This is ignored unless the download happens with the http keepalive handler (keepalive=1). Otherwise, the connection is left open for further use. The module level default for this option is 0 (keepalive connections will not be closed). keepalive = 1 [0|1] specifies whether keepalive should be used for HTTP/1.1 servers that support it. The module level default for this option is 1 (keepalive is enabled). progress_obj = None a class instance that supports the following methods: po.start(filename, url, basename, length, text) # length will be None if unknown po.update(read) # read == bytes read so far po.end() text = None specifies an alternativ text item in the beginning of the progress bar line. If not given, the basename of the file is used. throttle = 1.0 a number - if it's an int, it's the bytes/second throttle limit. If it's a float, it is first multiplied by bandwidth. If throttle == 0, throttling is disabled. If None, the module-level default (which can be set on default_grabber.throttle) is used. See BANDWIDTH THROTTLING for more information. timeout = None a positive float expressing the number of seconds to wait for socket operations. If the value is None or 0.0, socket operations will block forever. Setting this option causes urlgrabber to call the settimeout method on the Socket object used for the request. See the Python documentation on settimeout for more information. http://www.python.org/doc/current/lib/socket-objects.html bandwidth = 0 the nominal max bandwidth in bytes/second. If throttle is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set on default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for more information. range = None a tuple of the form (first_byte, last_byte) describing a byte range to retrieve. Either or both of the values may set to None. If first_byte is None, byte offset 0 is assumed. If last_byte is None, the last byte available is assumed. Note that the range specification is python-like in that (0,10) will yeild the first 10 bytes of the file. If set to None, no range will be used. reget = None [None|'simple'|'check_timestamp'] whether to attempt to reget a partially-downloaded file. Reget only applies to .urlgrab and (obviously) only if there is a partially downloaded file. Reget has two modes: 'simple' -- the local file will always be trusted. If there are 100 bytes in the local file, then the download will always begin 100 bytes into the requested file. 'check_timestamp' -- the timestamp of the server file will be compared to the timestamp of the local file. ONLY if the local file is newer than or the same age as the server file will reget be used. If the server file is newer, or the timestamp is not returned, the entire file will be fetched. NOTE: urlgrabber can do very little to verify that the partial file on disk is identical to the beginning of the remote file. You may want to either employ a custom "checkfunc" or simply avoid using reget in situations where corruption is a concern. user_agent = 'urlgrabber/VERSION' a string, usually of the form 'AGENT/VERSION' that is provided to HTTP servers in the User-agent header. The module level default for this option is "urlgrabber/VERSION". http_headers = None a tuple of 2-tuples, each containing a header and value. These will be used for http and https requests only. For example, you can do http_headers = (('Pragma', 'no-cache'),) ftp_headers = None this is just like http_headers, but will be used for ftp requests. proxies = None a dictionary that maps protocol schemes to proxy hosts. For example, to use a proxy server on host "foo" port 3128 for http and https URLs: proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' } note that proxy authentication information may be provided using normal URL constructs: proxies={ 'http' : 'http://user:host@foo:3128' } Lastly, if proxies is None, the default environment settings will be used. prefix = None a url prefix that will be prepended to all requested urls. For example: g = URLGrabber(prefix='http://foo.com/mirror/') g.urlgrab('some/file.txt') ## this will fetch 'http://foo.com/mirror/some/file.txt' This option exists primarily to allow identical behavior to MirrorGroup (and derived) instances. Note: a '/' will be inserted if necessary, so you cannot specify a prefix that ends with a partial file or directory name. opener = None Overrides the default urllib2.OpenerDirector provided to urllib2 when making requests. This option exists so that the urllib2 handler chain may be customized. Note that the range, reget, proxy, and keepalive features require that custom handlers be provided to urllib2 in order to function properly. If an opener option is provided, no attempt is made by urlgrabber to ensure chain integrity. You are responsible for ensuring that any extension handlers are present if said features are required. data = None Only relevant for the HTTP family (and ignored for other protocols), this allows HTTP POSTs. When the data kwarg is present (and not None), an HTTP request will automatically become a POST rather than GET. This is done by direct passthrough to urllib2. If you use this, you may also want to set the 'Content-length' and 'Content-type' headers with the http_headers option. Note that python 2.2 handles the case of these badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. RETRY RELATED ARGUMENTS retry = None the number of times to retry the grab before bailing. If this is zero, it will retry forever. This was intentional... really, it was :). If this value is not supplied or is supplied but is None retrying does not occur. retrycodes = [-1,2,4,5,6,7] a sequence of errorcodes (values of e.errno) for which it should retry. See the doc on URLGrabError for more details on this. You might consider modifying a copy of the default codes rather than building yours from scratch so that if the list is extended in the future (or one code is split into two) you can still enjoy the benefits of the default list. You can do that with something like this: retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes if 12 not in retrycodes: retrycodes.append(12) checkfunc = None a function to do additional checks. This defaults to None, which means no additional checking. The function should simply return on a successful check. It should raise URLGrabError on an unsuccessful check. Raising of any other exception will be considered immediate failure and no retries will occur. If it raises URLGrabError, the error code will determine the retry behavior. Negative error numbers are reserved for use by these passed in functions, so you can use many negative numbers for different types of failure. By default, -1 results in a retry, but this can be customized with retrycodes. If you simply pass in a function, it will be given exactly one argument: a CallbackObject instance with the .url attribute defined and either .filename (for urlgrab) or .data (for urlread). For urlgrab, .filename is the name of the local file. For urlread, .data is the actual string data. If you need other arguments passed to the callback (program state of some sort), you can do so like this: checkfunc=(function, ('arg1', 2), {'kwarg': 3}) if the downloaded file has filename /tmp/stuff, then this will result in this call (for urlgrab): function(obj, 'arg1', 2, kwarg=3) # obj.filename = '/tmp/stuff' # obj.url = 'http://foo.com/stuff' NOTE: both the "args" tuple and "kwargs" dict must be present if you use this syntax, but either (or both) can be empty. failure_callback = None The callback that gets called during retries when an attempt to fetch a file fails. The syntax for specifying the callback is identical to checkfunc, except for the attributes defined in the CallbackObject instance. The attributes for failure_callback are: exception = the raised exception url = the url we're trying to fetch tries = the number of tries so far (including this one) retry = the value of the retry option The callback is present primarily to inform the calling program of the failure, but if it raises an exception (including the one it's passed) that exception will NOT be caught and will therefore cause future retries to be aborted. The callback is called for EVERY failure, including the last one. On the last try, the callback can raise an alternate exception, but it cannot (without severe trickiness) prevent the exception from being raised. interrupt_callback = None This callback is called if KeyboardInterrupt is received at any point in the transfer. Basically, this callback can have three impacts on the fetch process based on the way it exits: 1) raise no exception: the current fetch will be aborted, but any further retries will still take place 2) raise a URLGrabError: if you're using a MirrorGroup, then this will prompt a failover to the next mirror according to the behavior of the MirrorGroup subclass. It is recommended that you raise URLGrabError with code 15, 'user abort'. If you are NOT using a MirrorGroup subclass, then this is the same as (3). 3) raise some other exception (such as KeyboardInterrupt), which will not be caught at either the grabber or mirror levels. That is, it will be raised up all the way to the caller. This callback is very similar to failure_callback. They are passed the same arguments, so you could use the same function for both. urlparser = URLParser() The URLParser class handles pre-processing of URLs, including auth-handling for user/pass encoded in http urls, file handing (that is, filenames not sent as a URL), and URL quoting. If you want to override any of this behavior, you can pass in a replacement instance. See also the 'quote' option. quote = None Whether or not to quote the path portion of a url. quote = 1 -> quote the URLs (they're not quoted yet) quote = 0 -> do not quote them (they're already quoted) quote = None -> guess what to do This option only affects proper urls like 'file:///etc/passwd'; it does not affect 'raw' filenames like '/etc/passwd'. The latter will always be quoted as they are converted to URLs. Also, only the path part of a url is quoted. If you need more fine-grained control, you should probably subclass URLParser and pass it in via the 'urlparser' option. BANDWIDTH THROTTLING urlgrabber supports throttling via two values: throttle and bandwidth Between the two, you can either specify and absolute throttle threshold or specify a theshold as a fraction of maximum available bandwidth. throttle is a number - if it's an int, it's the bytes/second throttle limit. If it's a float, it is first multiplied by bandwidth. If throttle == 0, throttling is disabled. If None, the module-level default (which can be set with set_throttle) is used. bandwidth is the nominal max bandwidth in bytes/second. If throttle is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set with set_bandwidth) is used. THROTTLING EXAMPLES: Lets say you have a 100 Mbps connection. This is (about) 10^8 bits per second, or 12,500,000 Bytes per second. You have a number of throttling options: *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float This will limit urlgrab to use half of your available bandwidth. *) set_throttle(6250000) # throttle is an int This will also limit urlgrab to use half of your available bandwidth, regardless of what bandwidth is set to. *) set_throttle(6250000); set_throttle(1.0) # float Use half your bandwidth *) set_throttle(6250000); set_throttle(2.0) # float Use up to 12,500,000 Bytes per second (your nominal max bandwidth) *) set_throttle(6250000); set_throttle(0) # throttle = 0 Disable throttling - this is more efficient than a very large throttle setting. *) set_throttle(0); set_throttle(1.0) # throttle is float, bandwidth = 0 Disable throttling - this is the default when the module is loaded. SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING) While this is flexible, it's not extremely obvious to the user. I suggest you implement a float throttle as a percent to make the distinction between absolute and relative throttling very explicit. Also, you may want to convert the units to something more convenient than bytes/second, such as kbps or kB/s, etc. N(t*sfrom t.is import __version__s???(t_cCs|S(N(tst(R((t^/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/grabber.pyRs(s HTTPException(s HTTPHandlers HTTPSHandler(sHTTPRangeHandlersHTTPSRangeHandlersFileRangeHandlersFTPRangeHandlersrange_tuple_normalizesrange_tuple_to_headers RangeErroricCso|atotidjo |t_ntotidjo |t_ntidjo |t_ndS(sSet the DEBUG object. This is called by _init_default_logger when the environment variable URLGRABBER_DEBUG is set, but can also be called by a calling program. Basically, if the calling program uses the logging module and would like to incorporate urlgrabber logging, then it can do so this way. It's probably not necessary as most internal logging is only for debugging purposes. The passed-in object should be a logging.Logger instance. It will be pushed into the keepalive and byterange modules if they're being used. The mirror module pulls this object in on import, so you will need to manually push into it. In fact, you may find it tidier to simply push your logging object (or objects) into each of these modules independently. N(tDBOBJtDEBUGthave_keepalivet keepalivetNonet have_ranget byteranget sslfactory(R((Rt set_loggers  cCsTy!tidid}dk}|ii|dt|d}|djo t n|i d}t |djo|d}nd}|djo|iti}n3|djo|iti}n|i|}|i||id }|i||i|Wn"ttt fj o d}nXt|dS( stExamines the environment variable URLGRABBER_DEBUG and creates a logging object (logging.logger) based on the contents. It takes the form URLGRABBER_DEBUG=level,filename where "level" can be either an integer or a log level from the logging module (DEBUG, INFO, etc). If the integer is zero or less, logging will be disabled. Filename is the filename where logs will be sent. If it is "-", then stdout will be used. If the filename is empty or missing, stderr will be used. If the variable cannot be processed or the logging module cannot be imported (python < 2.3) then logging will be disabled. Here are some examples: URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout URLGRABBER_DEBUG=INFO # log info and higher to stderr This funtion is called during module initialization. It is not intended to be called from outside. The only reason it is a function at all is to keep the module-level namespace tidy and to collect the code into a nice block.tURLGRABBER_DEBUGt,Niis%(asctime)s %(message)stt-t urlgrabber(tostenvirontsplittdbinfotloggingt _levelNamestgettinttlevelt ValueErrort Formattert formattertlentfilenamet StreamHandlertsyststderrthandlertstdoutt FileHandlert setFormattert getLoggerRt addHandlertsetLeveltKeyErrort ImportErrorR R (RRR RRR$R((Rt_init_default_loggers. #      t URLGrabErrorcBstZdZRS(s URLGrabError error codes: URLGrabber error codes (0 -- 255) 0 - everything looks good (you should never see this) 1 - malformed url 2 - local file doesn't exist 3 - request for non-file local file (dir, etc) 4 - IOError on fetch 5 - OSError on fetch 6 - no content length header when we expected one 7 - HTTPException 8 - Exceeded read limit (for urlread) 9 - Requested byte range not satisfiable. 10 - Byte range requested, but range support unavailable 11 - Illegal reget mode 12 - Socket timeout 13 - malformed proxy url 14 - HTTPError (includes .code and .exception attributes) 15 - user abort MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try Custom (non-builtin) classes derived from MirrorGroup (512 -- 767) [ this range reserved for application-specific error codes ] Retry codes (< 0) -1 - retry the download, unknown reason Note: to test which group a code is in, you can simply do integer division by 256: e.errno / 256 Negative codes are reserved for use by functions passed in to retrygrab with checkfunc. The value -1 is built in as a generic retry code and is already included in the retrycodes list. Therefore, you can create a custom check function that simply returns -1 and the fetch will be re-tried. For more customized retries, you can use other negative number and include them in retry-codes. This is nice for outputting useful messages about what failed. You can use these error codes like so: try: urlgrab(url) except URLGrabError, e: if e.errno == 3: ... # or print e.strerror # or simply print e #### print '[Errno %i] %s' % (e.errno, e.strerror) (t__name__t __module__t__doc__(((RR.s 3tCallbackObjectcBstZdZdZRS(sContainer for returned callback data. This is currently a dummy class into which urlgrabber can stuff information for passing to callbacks. This way, the prototype for all callbacks is the same, regardless of the data that will be passed back. Any function that accepts a callback function as an argument SHOULD document what it will define in this object. It is possible that this class will have some greater functionality in the future. cKs|ii|dS(N(tselft__dict__tupdatetkwargs(R3R6((Rt__init__Hs(R/R0R1R7(((RR2<s cKsti|||S(sJgrab the file at and make a local copy at If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if the copy_local kwarg == 0. See module documentation for a description of possible kwargs. N(tdefault_grabberturlgrabturlR R6(R:R R6((RR9KscKsti||S(s0open the url and return a file object If a progress object or throttle specifications exist, then a special file object will be returned that supports them. The file object can be treated like any other file object. See module documentation for a description of possible kwargs. N(R8turlopenR:R6(R:R6((RR;UscKsti|||S(s`read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' See module documentation for a description of possible kwargs. N(R8turlreadR:tlimitR6(R:R=R6((RR<_st URLParsercBsAtZdZdZdZdZdZdZdZRS(sGProcess the URLs before passing them to urllib2. This class does several things: * add any prefix * translate a "raw" file to a proper file: url * handle any http or https auth that's encoded within the url * quote the url Only the "parse" method is called directly, and it calls sub-methods. An instance of this class is held in the options object, which means that it's easy to change the behavior by sub-classing and passing the replacement in. It need only have a method like: url, parts = urlparser.parse(url, opts) c CsJ|i}|io|i||i}nti|}|\} }}} }}| p#t| djoc| tijoS|ddjoti i|}ndti|}ti|}d}n| ddgjo|i|}n|djo|i|}n|o|i|}nti|}||fS(sparse the url and return the (modified) url and its parts Note: a raw file WILL be quoted when it's converted to a URL. However, other urls (ones which come with a proper scheme) may or may not be quoted according to opts.quote opts.quote = 1 --> quote it opts.quote = 0 --> do not quote it opts.quote = None --> guess iis/\sfile:thttpthttpsN(toptstquotetprefixR3t add_prefixR:turlparsetpartstschemethosttpathtparmtquerytfragRtstringtlettersRtabspathturllibt pathname2urlt process_httpR tguess_should_quotet urlunparse( R3R:RARLRBRHRFRKRIRGRJ((Rtparse}s(   +  cCsB|ddjp|ddjo||}n|d|}|S(Nit/i(RCR:(R3R:RC((RRDs"c Cs|\} }} } }}d|jotoyE|idd\}}d|jo|idd\}}nWn.t j o"}tdtdtnXtotid||ntid|||n| || | ||fS(Nt@it:s Bad URL: %ssadding HTTP auth: %s, %s(RFRGRHRIRJRKRLt auth_handlerRt user_passtusertpasswordRteR.RR:Rtinfot add_passwordR ( R3RFRLR\R]RHR[RZRKRIRGRJ((RRRs  cCs=|\}}}}}}ti|}||||||fS(s quote the URL This method quotes ONLY the path part. If you need to quote other parts, you should override this and pass in your derived class. The other alternative is to quote other parts before passing into urlgrabber. N( RFRGRHRIRJRKRLRPRB(R3RFRLRHRKRIRGRJ((RRBst0123456789ABCDEFc Cs|\}}}} }}d|jodSnti|d}|djox|djot ||djodSn||d|d!i }|d|ijp|d|ijodSnti|d|d}qOWdSndS(s Guess whether we should quote a path. This amounts to guessing whether it's already quoted. find ' ' -> 1 find '%' -> 1 find '%XX' -> 0 else -> 1 t it%iiiN(RFRGRHRIRJRKRLRMtfindtindRtuppertcodeR3thexvals( R3RFRLRfRHRKRdRIRGRJ((RRSs    (( R/R0R1RURDRRRBRgRS(((RR>js  %   tURLGrabberOptionscBsGtZdZedZdZdZdZdZdZ RS(sClass to ease kwargs handling.cKs5||_|djo|in|i|dS(sInitialize URLGrabberOptions object. Set default values for all options and then update options specified in kwargs. N(tdelegateR3R t _set_defaultst_set_attributesR6(R3RiR6((RR7s   cCs>|io't|i|ot|i|Snt|dS(N(R3RithasattrtnametgetattrtAttributeError(R3Rm((Rt __getattr__scCsW|idjodSn<t|itdjot|iSn|i|iSdS(sRCalculate raw throttle value from throttle and bandwidth values. iN(R3tthrottlettypetfloatt bandwidth(R3((Rt raw_throttles cKstd||S(sCreate a derived URLGrabberOptions instance. This method creates a new instance and overrides the options specified in kwargs. RiN(RhR3R6(R3R6((RtderivescKs}|ii|to&|idot|i|_n|idddgjo#t dt d|ifndS(s7Update object attributes with those provided in kwargs.trangetsimpletcheck_timestampi sIllegal reget mode: %sN( R3R4R5R6R thas_keytrange_tuple_normalizeRwtregetR R.R(R3R6((RRks cCsd |_d|_d|_d |_ddddddg|_d |_d|_d|_ d |_ d t |_ d |_ d |_d |_d |_d |_d |_d |_t|_d |_d |_d |_d |_d |_t|_d |_d |_d |_d S( sSet all options to their default values. When adding new options, make sure a default is provided here. f1.0iiiiiiis urlgrabber/%siN( R R3t progress_objRqRttretryt retrycodest checkfunct copy_localtclose_connectionRwt __version__t user_agentRtproxiesR|tfailure_callbacktinterrupt_callbackRCtopenertTruet cache_openersttimeoutttextt http_headerst ftp_headerstdataR>t urlparserRBt ssl_ca_certt ssl_context(R3((RRj s8                         ( R/R0R1R R7RpRuRvRkRj(((RRhs    t URLGrabbercBsJtZdZdZdZdZedZedZdZ RS(sProvides easy opening of URLs with a variety of options. All options are specified as kwargs. Options may be specified when the class is created and may be overridden on a per request basis. New objects inherit default values from default_grabber. cKst||_dS(N(RhR6R3RA(R3R6((RR77sc Gsd} x| d} d}d} d} to!tid| |i|dny9t ||f|h} totidn| SWnPt j o!}|}|i} |i} n%tj o}|}|i} nXtotid|n| oototid| n|i| \}} }td|d|dd | d |i}||| |n|idjp| |ijototid nn| dj o8| |ijo(totid | |inq q WdS( Niisattempt %i/%s: %stsuccesss exception: %sscalling callback: %st exceptionR:ttriesR~sretries exceeded, re-raisings)retrycode (%i) not in list %s, re-raising(RR Rt retrycodetcallbackRR^RAR~targstapplytfunctrR.R]RterrnotKeyboardInterruptRR3t_make_callbacktcb_functcb_argst cb_kwargsR2tobjR(R3RARRRRR]RRRRRRR((Rt_retry:sP      cKsI|ii|}|ii||\}}d}|i |||S(sopen the url and return a file object If a progress object or throttle value specified when this object was created, then a special file object will be returned that supports them. The file object can be treated like any other file object. cCst|ddd|S(NR RA(tURLGrabberFileObjectR:R RA(RAR:((Rt retryfuncrsN( R3RARvR6RRUR:RFRR(R3R:R6RFRRA((RR;is  c  s[ii|} | ii|| \}}|\} }}} }}|djoti iti|}n| djo| i oti|}|oti id||}nti i|p tdtd|fq9ti i|p tdtd|fq9| ip|Sq9nd} i| | ||S( s grab the file at and make a local copy at If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if copy_local == 0. tfiles//isLocal file does not exist: %sisNot a normal file: %scst|||}zo|i|idj oQi |i\}}}t }||_||_t||f||nWd|iX|S(N(RR:R RAtfot_do_grabRR R3RRRRR2RRtclose(RAR:R RRRRR(R3(RRs     N(R3RARvR6RRUR:RFRGRHRIRJRKRLR R RtbasenameRPtunquoteRt url2pathnametnormpathtexistsR.RtisfileRwRR( R3R:R R6RLRHRFRKRIRGRJRAR((R3RR9vs(    c sii|}|ii||\}}|dj o|d}nd}i ||||}|o6t ||jo#tdtd||fn|S(s2read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' ic st|ddd|}d}z|djo|i}n|i|}|idj oQi |i\}}}t}||_||_t||f||nWd|iX|S(NR RAR(RR:R RARtsR=treadRR3RRRRR2RRRR( RAR:R=RRRRRR(R3(RRs     isExceeded limit (%i): %sN(R3RARvR6RRUR:RFR=R RRRRR.R(R3R:R=R6RRFRRA((R3RR<s  cCs&t|o|fhfSn|SdS(N(tcallablet callback_obj(R3R((RRs ( R/R0R1R7RR;R R9R<R(((RR.s   / + (RcBstZdZdZdZdZdZdZdZdZ dZ e d Z e d Z d d Zd ZRS(sThis is a file-object wrapper that supports progress objects and throttling. This exists to solve the following problem: lets say you want to drop-in replace a normal open with urlopen. You want to use a progress meter and/or throttling, but how do you do that without rewriting your code? Answer: urlopen will return a wrapped file object that does the progress meter and-or throttling internally. cCsr||_||_||_d|_d|_dd|_ti|_ d|_ d|_ d|_ |i dS(NRiii(R:R3R RAR Rt_rbuft _rbufsizettimet_ttimet_tsizet _amount_readt_openert_do_open(R3R:R RA((RR7s         cCs4t|i|ot|i|Snt|dS(sThis effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. This includes methods.N(RlR3RRmRnRo(R3Rm((RRpscCs|iio|iiSnf|idjoUg}to |ii}t o|ii p |ii }|ii oY|it|ii |p|itin|p|itiqnti|ii|ii}|o'|it|it|n|o|it n|it|iiot|||_n|i||_g|i_n|iS(s8Build a urllib2 OpenerDirector based on request options.N(R3RARRR thandlersRRtneed_keepalive_handlertrange_handlersRwR|tneed_range_handlerRtappendtCachedProxyHandlerturllib2t HTTPHandlert FTPHandlerR t get_factoryRRt ssl_factoryt HTTPSHandlertextendRYRtCachedOpenerDirectort create_openert addheaders(R3RRRR((Rt _get_openers4     cCs|i}ti|i|ii}|i ||i ||i ||\} } |io|iidjod}y<| id} ti| }||ijo d}nWntfj o d}nX|oI| id|i_|id=|i ||i ||\} } q+nti|i\}}} }}}t!i"| } |ii#p|ii$p |ii%p0| i&|_&t'| do| i(|_(qkn|ii#oy!t)| d} | |i+} Wn"t,t-tfj o d} nX|ii#i.t/|i0t!i"|it1ii2| | d|ii3|ii#i4dn| | |_ |_ dS( NRyis last-modifieditRangetreadlinesContent-LengthR(5R3RRRtRequestR:RARtreqt _add_headerst _build_ranget _make_requestRthdrt reget_timeR|t fetch_againt getdate_tztmodified_tupletrfc822t mktime_tztmodified_stampt TypeErrorRR theadersRERGRHRIRJRKRLRPRR}RuRRRlRRtlengthRR+RtstarttstrR RRRR5(R3RRKRRRGRLRJRHRIRRRRR((RR)sL         $*   cCs|iio|id|iiny|i}Wntj o d}nX|ii o>|djo1x.|ii D]\}}|i||qzWn|ii o>|djo1x.|ii D]\}}|i||qWndS(Ns User-agentR?R@tftp(shttpshttps( R3RARRt add_headertget_typetreq_typeRR RthtvR(R3RRRR((RR^s    cCsMd|_d|_d}d}to|iiot |i t djoeyt i |i }Wntj oqX|t|_|t}||_|df}d|_n|iioWtptdtdn|ii}|do|d||df}qn|o+t|}|o|id|qIndS(NiRii s2Byte range requested but range support unavailableR(R R3RRt reget_lengthtrtR RAR|RrR RtstatRtOSErrortST_MTIMEtST_SIZERRwR.Rtrange_tuple_to_headertheaderRR(R3RRRRR((RRjs0  0         cCsyztoT|iioGti}ti|iiz|i |}Wdti|Xn|i |}|i }Wntj o%}tdtd|fn_tj o}tdt|n6tij o9}tdt|}|i|_||_|ntj oo}t|do:to3t|ito tdtd|fq tdtd |fnutj o%}td td |fnEtj o.}td td |i i!|fn X||fSdS(Nis Bad URL: %si itreasoni s Timeout: %sis IOError: %sis OSError: %sisHTTP Exception (%s): %s("thave_socket_timeoutR3RARtsockettgetdefaulttimeouttold_totsetdefaulttimeoutRtopenRRR^RRR]R.Rt RangeErrorRRt HTTPErrortnew_eRfRtIOErrorRlt isinstanceRt TimeoutErrorRt HTTPExceptiont __class__R/(R3RRRRRR]R((RRs8     *   *cCs|iot|id}nt|id}dd}d}|i|}|t |}x8|o0|i ||i|}|t |}qdW|i y>|i i d}ti|}ti|i||fWntfj o }nX|S(sdump the file to self.filename.tabtwbiiis last-modifiedN(R3RRR tnew_fotbstsizeRtblockRtwriteRRRRRRRRtutimeRR](R3R]RRRRRR((RRs*    c Cs^|io?|dj o2t|i}||jo||}qIdSn|ig} t|i}x|djp|o|ii oX|i |ii t i |i }|djot i|nt i |_ n|djo |i}nt||i}y|ii|}Wntij o%}tdtd|fnbtj o%}tdtd|fn2tj o%}tdtd|fnXt|}|pPn|o||}n| i|||}||_ |i||_|iio|iii |iqgqgWt!i"| d|_dS( sfill the buffer to contain at least 'amt' bytes by reading from the underlying file object. If amt is None, then it will read until it gets nothing more. It updates the progress meter and throttles after every self._rbufsize bytes.NiisSocket Error: %si s Timeout: %ss IOError: %sR(#R3RtamtR RtLtbuftbufsizeRARuRRRtdifftsleepRt readamounttminRRtnewRterrorR]R.RRRtnewsizeRRR}R5RMtjoin( R3RRR]RRR R R R ((Rt _fill_buffersN  '    !     cCsS|i||djo|id}|_n|i| |i|}|_|S(NR(R3RRR RR(R3RR((RRs   icCs%ti|id}x|djod|jot|ijn oZt|i}|i||i t|i|jpPnti|id|}qW|djot|i}n |d}d|jot|ijno |}n|i| |i|}|_|S(Ns ii( RMRcR3RtiR=RRRRR(R3R=RRR((RRs5  ' cCsd|iio|iii|in|ii|iioy|iiWq`q`XndS(N(R3RAR}tendRRRR(R3((RRs   (R/R0R1R7RpRRRRRRR RRRR(((RRs   6 5 #  4  cGsxJtD]B\}}||jo)x|iD]}|i|q*W|SqqW|pti}n|i |}ti ||f|S(N( t_handler_cachetcached_handlersRRR$t add_parentRR RRR(RRR$RR((RRs    c CsxtD]9\}}||jo totid|nPqqWx{|iD]m\}}t i |\}}t i |\}}|djp |djotdtd|qQqQWtotid|nti|}ti||f|S(Nsre-using proxy settings: %si sBad proxy URL: %sscreating new proxy handler: %s(t _proxy_cachetpdictR$RRtdebugtitemstkRRPt splittypetutypeR:t splithostRHtotherR R.RR^Rt ProxyHandlerR( RRRRR:RHR$RR!((RR%s"    !cCs |t_dS(s8Deprecated. Use: default_grabber.throttle = new_throttleN(t new_throttleR8Rq(R#((Rt set_throttle8scCs |t_dS(s:Deprecated. Use: default_grabber.bandwidth = new_bandwidthN(t new_bandwidthR8Rt(R%((Rt set_bandwidth<scCs |t_dS(s@Deprecated. Use: default_grabber.progress_obj = new_progress_objN(tnew_progress_objR8R}(R'((Rtset_progress_obj@scCs |t_dS(s<Deprecated. Use: default_grabber.user_agent = new_user_agentN(tnew_user_agentR8R(R)((Rtset_user_agentDsiiiiiiic Cs^hd|<d|<d|<d|<d|<d|<d|<d| <} t ||| S( s5Deprecated. Use: urlgrab() with the retry arg insteadRRR}RqRtR~RRN( RRR}RqRttnumtriesRRR6R9R:R ( R:R RRR}RqRtR+RRR6((Rt retrygrabHsN c CsIdk}y|idd!\}}Wn1tj o%dG|idGdGH|inXh}x@|idD]1}ti |dd\}}t ||| [copy_local=0|1] [close_connection=0|1]t=f1.0i is)throttle: %s, throttle bandwidth: %s B/s(stext_progress_meterR}s LOCAL FILE:(R"targvR:R RtexitR6taRMRRRRR$R&R8RqRttprogressttext_progress_meterR,R]RR9RmR.( R0R]R:RRR R"R2R6Rm((Rt _main_testZs0    c CsDdk}y|idd!\}}Wn1tj o%dG|idGdGH|inXh}x@|idD]1}ti |dd\}}t ||| [copy_local=0|1] [close_connection=0|1]R-(stext_progress_meterR}tfoocCsq|G|GHdk}|i}|djodGHtddn|djodGHtddndGHdS( Nf0.5s forcing retryif0.75sforcing failureisforcing immediate failureR(thellottheretrandomtrnumR.(R R5R6R7R8((Rtcfuncs     R5R6Rs LOCAL FILE:(shello(R"R.R:R RR/R6R0RMRRRRR1R2R,R]R9RR,RmR.( R0R]R:RRR R"R2R6R9Rm((Rt _retry_testus.    c Csdk}dk}dk}|djo t}nd|GHt|} | i}| i xt t t tgD]r} |i|}|i}t|dd}d| iG| |||i}||jo dGHqpdGHqpWdS(Nsusing file "%s" for comparisonsistesting %-30s tpassedtFAILED(R7t cStringIOR"R R t__file__RRRts_inputRt_test_file_object_smallreadt_test_file_object_readallt_test_file_object_readlinet_test_file_object_readlinesttestfunctStringIOtfo_inputt fo_outputRtwrapperR/tgetvaluets_output( R R=R?RFR7RHR"RGRJRDR((Rt_file_object_tests$           cCs7x0|id}|i||pdSqqWdS(Nii(RHRRRGR(RHRGR((RR@s  cCs|i}|i|dS(N(RHRRRGR(RHRGR((RRAs cCs4x-|i}|i||pdSqqWdS(Ni(RHRRRGR(RHRGR((RRBs   cCs)|i}|iti|ddS(NR(RHt readlinestliRGRRMR(RHRGRM((RRCs t__main__ttest(LR1Rtos.pathR"RERRRMRPRRR/RRR tHTTPBasicAuthHandlertHTTPPasswordMgrWithDefaultRealmRYti18nRR,tmsgthttplibRR RRRRRtFalseR tHTTPRangeHandlertHTTPSRangeHandlertFileRangeHandlertFTPRangeHandlerR{RRRR RRRRRoRR R-RR.R2R9R;R<R>RhRR8RRRRRR$R&R(R*R,R3R:RKR@RARBRC(;RR3R*R RKRRCRR&R<RR8RR9R-RR@RPRRRRAR R.RYRTRR{RMRRR;RRR"RRR RYRRRBR RXRRRRhR$RERWR(R>R:RRZR,RR2((Rt?ms         !       5       -6 uO E      6  #      0707010002bc09000081a400000000000000020000000143fcace600004695000000b600010006ffffffffffffffff0000003d00000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/mirror.py# This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko """Module for downloading files from a pool of mirrors DESCRIPTION This module provides support for downloading files from a pool of mirrors with configurable failover policies. To a large extent, the failover policy is chosen by using different classes derived from the main class, MirrorGroup. Instances of MirrorGroup (and cousins) act very much like URLGrabber instances in that they have urlread, urlgrab, and urlopen methods. They can therefore, be used in very similar ways. from urlgrabber.grabber import URLGrabber from urlgrabber.mirror import MirrorGroup gr = URLGrabber() mg = MirrorGroup(gr, ['http://foo.com/some/directory/', 'http://bar.org/maybe/somewhere/else/', 'ftp://baz.net/some/other/place/entirely/'] mg.urlgrab('relative/path.zip') The assumption is that all mirrors are identical AFTER the base urls specified, so that any mirror can be used to fetch any file. FAILOVER The failover mechanism is designed to be customized by subclassing from MirrorGroup to change the details of the behavior. In general, the classes maintain a master mirror list and a "current mirror" index. When a download is initiated, a copy of this list and index is created for that download only. The specific failover policy depends on the class used, and so is documented in the class documentation. Note that ANY behavior of the class can be overridden, so any failover policy at all is possible (although you may need to change the interface in extreme cases). CUSTOMIZATION Most customization of a MirrorGroup object is done at instantiation time (or via subclassing). There are four major types of customization: 1) Pass in a custom urlgrabber - The passed in urlgrabber will be used (by default... see #2) for the grabs, so options to it apply for the url-fetching 2) Custom mirror list - Mirror lists can simply be a list of stings mirrors (as shown in the example above) but each can also be a dict, allowing for more options. For example, the first mirror in the list above could also have been: {'mirror': 'http://foo.com/some/directory/', 'grabber': , 'kwargs': { }} All mirrors are converted to this format internally. If 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. 4) Finally, any kwargs passed in for the specific file (to the urlgrab method, for example) will be folded in. The options passed into the grabber's urlXXX methods will override any options specified in a custom mirror dict. """ # $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $ import random import thread # needed for locking to make this threadsafe from grabber import URLGrabError, CallbackObject, DEBUG try: from i18n import _ except ImportError, msg: def _(st): return st class GrabRequest: """This is a dummy class used to hold information about the specific request. For example, a single file. By maintaining this information separately, we can accomplish two things: 1) make it a little easier to be threadsafe 2) have request-specific parameters """ pass class MirrorGroup: """Base Mirror class Instances of this class are built with a grabber object and a list of mirrors. Then all calls to urlXXX should be passed relative urls. The requested file will be searched for on the first mirror. If the grabber raises an exception (possibly after some retries) then that mirror will be removed from the list, and the next will be attempted. If all mirrors are exhausted, then an exception will be raised. MirrorGroup has the following failover policy: * downloads begin with the first mirror * by default (see default_action below) a failure (after retries) causes it to increment the local AND master indices. Also, the current mirror is removed from the local list (but NOT the master list - the mirror can potentially be used for other files) * if the local list is ever exhausted, a URLGrabError will be raised (errno=256, no more mirrors) OPTIONS In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: default_action A dict that describes the actions to be taken upon failure (after retries). default_action can contain any of the following keys (shown here with their default values): default_action = {'increment': 1, 'increment_master': 1, 'remove': 1, 'remove_master': 0, 'fail': 0} In this context, 'increment' means "use the next mirror" and 'remove' means "never use this mirror again". The two 'master' values refer to the instance-level mirror list (used for all files), whereas the non-master values refer to the current download only. The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current download. This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) at method-execution time (only applies to current fetch), filename = mg.urlgrab(url, default_action={'increment': 0}) or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default 'remove': 1, # class default 'remove_master': 0, # class default 'fail': 0} # set at instantiation, reset # from callback failure_callback this is a callback that will be called when a mirror "fails", meaning the grabber raises some URLGrabError. If this is a tuple, it is interpreted to be of the form (cb, args, kwargs) where cb is the actual callable object (function, method, etc). Otherwise, it is assumed to be the callable object itself. The callback will be passed a grabber.CallbackObject instance along with args and kwargs (if present). The following attributes are defined withing the instance: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror # and .relative_url The failure callback can return an action dict, as described above. Like default_action, the failure_callback can be set at instantiation time or when the urlXXX method is called. In the latter case, it applies only for that fetch. The callback can re-raise the exception quite easily. For example, this is a perfectly adequate callback function: def callback(obj): raise obj.exception WARNING: do not save the exception object (or the CallbackObject instance). As they contain stack frame references, they can lead to circular references. Notes: * The behavior can be customized by deriving and overriding the 'CONFIGURATION METHODS' * The 'grabber' instance is kept as a reference, not copied. Therefore, the grabber instance can be modified externally and changes will take effect immediately. """ # notes on thread-safety: # A GrabRequest should never be shared by multiple threads because # it's never saved inside the MG object and never returned outside it. # therefore, it should be safe to access/modify grabrequest data # without a lock. However, accessing the mirrors and _next attributes # of the MG itself must be done when locked to prevent (for example) # removal of the wrong mirror. ############################################################## # CONFIGURATION METHODS - intended to be overridden to # customize behavior def __init__(self, grabber, mirrors, **kwargs): """Initialize the MirrorGroup object. REQUIRED ARGUMENTS grabber - URLGrabber instance mirrors - a list of mirrors OPTIONAL ARGUMENTS failure_callback - callback to be used when a mirror fails default_action - dict of failure actions See the module-level and class level documentation for more details. """ # OVERRIDE IDEAS: # shuffle the list to randomize order self.grabber = grabber self.mirrors = self._parse_mirrors(mirrors) self._next = 0 self._lock = thread.allocate_lock() self.default_action = None self._process_kwargs(kwargs) # if these values are found in **kwargs passed to one of the urlXXX # methods, they will be stripped before getting passed on to the # grabber options = ['default_action', 'failure_callback'] def _process_kwargs(self, kwargs): self.failure_callback = kwargs.get('failure_callback') self.default_action = kwargs.get('default_action') def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: if type(m) == type(''): m = {'mirror': m} parsed_mirrors.append(m) return parsed_mirrors def _load_gr(self, gr): # OVERRIDE IDEAS: # shuffle gr list self._lock.acquire() gr.mirrors = list(self.mirrors) gr._next = self._next self._lock.release() def _get_mirror(self, gr): # OVERRIDE IDEAS: # return a random mirror so that multiple mirrors get used # even without failures. if not gr.mirrors: raise URLGrabError(256, _('No more mirrors to try.')) return gr.mirrors[gr._next] def _failure(self, gr, cb_obj): # OVERRIDE IDEAS: # inspect the error - remove=1 for 404, remove=2 for connection # refused, etc. (this can also be done via # the callback) cb = gr.kw.get('failure_callback') or self.failure_callback if cb: if type(cb) == type( () ): cb, args, kwargs = cb else: args, kwargs = (), {} action = cb(cb_obj, *args, **kwargs) or {} else: action = {} # XXXX - decide - there are two ways to do this # the first is action-overriding as a whole - use the entire action # or fall back on module level defaults #action = action or gr.kw.get('default_action') or self.default_action # the other is to fall through for each element in the action dict a = dict(self.default_action or {}) a.update(gr.kw.get('default_action', {})) a.update(action) action = a self.increment_mirror(gr, action) if action and action.get('fail', 0): raise def increment_mirror(self, gr, action={}): """Tell the mirror object increment the mirror index This increments the mirror index, which amounts to telling the mirror object to use a different mirror (for this and future downloads). This is a SEMI-public method. It will be called internally, and you may never need to call it. However, it is provided (and is made public) so that the calling program can increment the mirror choice for methods like urlopen. For example, with urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only 2 remove the mirror permanently beware of remove=0 as it can lead to infinite loops """ badmirror = gr.mirrors[gr._next] self._lock.acquire() try: ind = self.mirrors.index(badmirror) except ValueError: pass else: if action.get('remove_master', 0): del self.mirrors[ind] elif self._next == ind and action.get('increment_master', 1): self._next += 1 if self._next >= len(self.mirrors): self._next = 0 self._lock.release() if action.get('remove', 1): del gr.mirrors[gr._next] elif action.get('increment', 1): gr._next += 1 if gr._next >= len(gr.mirrors): gr._next = 0 if DEBUG: grm = [m['mirror'] for m in gr.mirrors] DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) selfm = [m['mirror'] for m in self.mirrors] DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) ##################################################################### # NON-CONFIGURATION METHODS # these methods are designed to be largely workhorse methods that # are not intended to be overridden. That doesn't mean you can't; # if you want to, feel free, but most things can be done by # by overriding the configuration methods :) def _join_url(self, base_url, rel_url): if base_url.endswith('/') or rel_url.startswith('/'): return base_url + rel_url else: return base_url + '/' + rel_url def _mirror_try(self, func, url, kw): gr = GrabRequest() gr.func = func gr.url = url gr.kw = dict(kw) self._load_gr(gr) for k in self.options: try: del kw[k] except KeyError: pass while 1: mirrorchoice = self._get_mirror(gr) fullurl = self._join_url(mirrorchoice['mirror'], gr.url) kwargs = dict(mirrorchoice.get('kwargs', {})) kwargs.update(kw) grabber = mirrorchoice.get('grabber') or self.grabber func_ref = getattr(grabber, func) if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) try: return func_ref( *(fullurl,), **kwargs ) except URLGrabError, e: if DEBUG: DEBUG.info('MIRROR: failed') obj = CallbackObject() obj.exception = e obj.mirror = mirrorchoice['mirror'] obj.relative_url = gr.url obj.url = fullurl self._failure(gr, obj) def urlgrab(self, url, filename=None, **kwargs): kw = dict(kwargs) kw['filename'] = filename func = 'urlgrab' return self._mirror_try(func, url, kw) def urlopen(self, url, **kwargs): kw = dict(kwargs) func = 'urlopen' return self._mirror_try(func, url, kw) def urlread(self, url, limit=None, **kwargs): kw = dict(kwargs) kw['limit'] = limit func = 'urlread' return self._mirror_try(func, url, kw) class MGRandomStart(MirrorGroup): """A mirror group that starts at a random mirror in the list. This behavior of this class is identical to MirrorGroup, except that it starts at a random location in the mirror list. """ def __init__(self, grabber, mirrors, **kwargs): """Initialize the object The arguments for intialization are the same as for MirrorGroup """ MirrorGroup.__init__(self, grabber, mirrors, **kwargs) self._next = random.randrange(len(mirrors)) class MGRandomOrder(MirrorGroup): """A mirror group that uses mirrors in a random order. This behavior of this class is identical to MirrorGroup, except that it uses the mirrors in a random order. Note that the order is set at initialization time and fixed thereafter. That is, it does not pick a random mirror after each failure. """ def __init__(self, grabber, mirrors, **kwargs): """Initialize the object The arguments for intialization are the same as for MirrorGroup """ MirrorGroup.__init__(self, grabber, mirrors, **kwargs) random.shuffle(self.mirrors) if __name__ == '__main__': pass 0707010002bc01000081a40000000000000002000000014513354f000008d3000000b600010006ffffffffffffffff0000003f00000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/__init__.py# This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Library General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko # $Id: __init__.py,v 1.20 2006/09/22 00:58:55 mstenner Exp $ """A high-level cross-protocol url-grabber. Using urlgrabber, data can be fetched in three basic ways: urlgrab(url) copy the file to the local filesystem urlopen(url) open the remote file and return a file object (like urllib2.urlopen) urlread(url) return the contents of the file as a string When using these functions (or methods), urlgrabber supports the following features: * identical behavior for http://, ftp://, and file:// urls * http keepalive - faster downloads of many files by using only a single connection * byte ranges - fetch only a portion of the file * reget - for a urlgrab, resume a partial download * progress meters - the ability to report download progress automatically, even when using urlopen! * throttling - restrict bandwidth usage * retries - automatically retry a download if it fails. The number of retries and failure types are configurable. * authenticated server access for http and ftp * proxy support - support for authenticated http and ftp proxies * mirror groups - treat a list of mirrors as a single source, automatically switching mirrors if there is a failure. """ __version__ = '3.1.0' __date__ = '2006/09/21' __author__ = 'Michael D. Stenner , ' \ 'Ryan Tomayko ' __url__ = 'http://linux.duke.edu/projects/urlgrabber/' from grabber import urlgrab, urlopen, urlread 0707010002bc08000081a40000000000000002000000014ae194ad00004b1c000000b600010006ffffffffffffffff0000004100000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/keepalive.pycm 5Ec@sdZdkZdkZdkZdkZdadkZdkZei djo da nda dfdYZ dfd YZ d e ei fd YZ d e eifd YZdeifdYZdeifdYZdeifdYZdZdZdZddZdZddZedjoadkZdkZy$eeidZeidZWndeidGHqXeeendS(s An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive. >>> import urllib2 >>> from keepalive import HTTPHandler >>> keepalive_handler = HTTPHandler() >>> opener = urllib2.build_opener(keepalive_handler) >>> urllib2.install_opener(opener) >>> >>> fo = urllib2.urlopen('http://www.python.org') If a connection to a given host is requested, and all of the existing connections are still in use, another connection will be opened. If the handler tries to use an existing connection but it fails in some way, it will be closed and removed from the pool. To remove the handler, simply re-run build_opener with no arguments, and install that opener. You can explicitly close connections by using the close_connection() method of the returned file-like object (described below) or you can use the handler methods: close_connection(host) close_all() open_connections() NOTE: using the close_connection and close_all methods of the handler should be done with care when using multiple threads. * there is nothing that prevents another thread from creating new connections immediately after connections are closed * no checks are done to prevent in-use connections from being closed >>> keepalive_handler.close_all() EXTRA ATTRIBUTES AND METHODS Upon a status of 200, the object returned has a few additional attributes and methods, which should not be used if you want to remain consistent with the normal urllib2-returned objects: close_connection() - close the connection to the host readlines() - you know, readlines() status - the return status (ie 404) reason - english translation of status (ie 'File not found') If you want the best of both worlds, use this inside an AttributeError-catching try: >>> try: status = fo.status >>> except AttributeError: status = None Unfortunately, these are ONLY there if status == 200, so it's not easy to distinguish between non-200 responses. The reason is that urllib2 tries to do clever things with error codes 301, 302, 401, and 407, and it wraps the object upon return. For python versions earlier than 2.4, you can avoid this fancy error handling by setting the module-level global HANDLE_ERRORS to zero. You see, prior to 2.4, it's the HTTP Handler's job to determine what to handle specially, and what to just pass up. HANDLE_ERRORS == 0 means "pass everything up". In python 2.4, however, this job no longer belongs to the HTTP Handler and is now done by a NEW handler, HTTPErrorProcessor. Here's the bottom line: python version < 2.4 HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as errors HANDLE_ERRORS == 0 pass everything up, error processing is left to the calling code python version >= 2.4 HANDLE_ERRORS == 1 pass up 200, treat the rest as errors HANDLE_ERRORS == 0 (default) pass everything up, let the other handlers (specifically, HTTPErrorProcessor) decide what to do In practice, setting the variable either way makes little difference in python 2.4, so for the most consistent behavior across versions, you probably just want to use the defaults, which will give you exceptions on errors. NiiiitConnectionManagercBsGtZdZdZdZdZdZdZedZ RS(sV The connection manager must be able to: * keep track of all existing cCs.ti|_h|_h|_h|_dS(N(tthreadt allocate_locktselft_lockt_hostmapt_connmapt _readymap(R((t`/export/xvm-gate/xvm-3.4///proto/install/usr/lib/python2.4/site-packages/urlgrabber/keepalive.pyt__init__zs  cCsx|iizV|ii|pg|i| host is the host:port spec, as in 'www.cnn.com:8080' as passed in. no error occurs if there is no connection to that host.N(RR!RR thRtclose(RR R'((Rtclose_connections cCsUxN|iiiD]7\}}x(|D] }|ii||iq)WqWdS(sclose all open connectionsN( RR!RR#R tconnsR'RR((RR*R'R ((Rt close_alls cCs|ii|ddS(sdtells us that this request is now closed and the the connection is ready for another requestiN(RR!RR(RtrequestR R((Rt_request_closedsicCs)|o|in|ii|dS(N(R(RRR!R(RR RR(((Rt_remove_connectionscCs |i}|ptidny|ii|}x|oQ|i |||}|oPn|i |ii ||ii|}q>W|i |}totid|t|n|ii||d|i|||i}Wn0titifj o}ti|nX|io|ii |ntotid|i|in||_||_|i|_||_ |i|_!|i"|_#|i|_"|idjpt$ o|Sn)|i%id|||i|i"|i#SdS(Ns no host givens"creating new connection to %s (%d)isSTATUS: %s, %sithttp(&treqtget_hostR turllib2tURLErrorRR!RR't_reuse_connectiontrR(Rt_get_connectiontDEBUGtinfotidRt_start_transactiont getresponsetsocketterrorthttplibt HTTPExceptionterrt will_closetstatustreasont_handlert_hostt get_full_urlt_urlt _connectiontcodetmsgtheaderst HANDLE_ERRORStparent(RR0R@R'R R5((Rtdo_opensJ          cCsy |i|||i}Wnptiti fj o d}nMt o!t idd|t |n|ii||inX|djp|idjo.t ot id|t |nd}n%t ot id|t |n|S(sGstart the transaction with a re-used connection return a response object (r) upon success or None on failure. This DOES not close or remove bad connections in cases where it returns. However, if an unexpected exception occurs, it will close and remove the connection before re-raising. sunexpected exception - closing sconnection to %s (%d)i s&failed to re-use connection to %s (%d)sre-using connection to %s (%d)N(RR:R'R0R;R5R<R=R>R?RR7R R9R!RR(tversionR8(RR'R0R R5((RR4s*      cCsXy|io~|i}|id|i|iidp|i ddn|iidp|i ddt |qn|id|iWn0t i t ifj o}ti|nXx!|iiD]}|i |qWx-|iiD]\}}|i ||q W|i|io|i|ndS(NtPOSTs Content-types!application/x-www-form-urlencodedsContent-lengths%dtGET(R0thas_datatget_datatdataR't putrequestt get_selectorRKR t putheaderR%R<R=R>R?R@R2R3RRMt addheaderstargsR#tktvt endheaderstsend(RR'R0RYR@R[RTRZ((RR:/s*    "    cCstS(N(tNotImplementedError(RR ((RR6Fs( RRR R&R)R+R-R.RNR4R:R6(((RR s       0 + t HTTPHandlercBs#tZdZdZdZRS(NcCsti|dS(N(R R R(R((RR JscCs |i|S(N(RRNR0(RR0((Rt http_openMscCs t|S(N(tHTTPConnectionR (RR ((RR6Ps(RRR R`R6(((RR_Is  t HTTPSHandlercBs&tZddZdZdZRS(NcCs1ti||pti}n||_dS(N(R R Rt ssl_factoryt sslfactoryt get_factoryt _ssl_factory(RRc((RR Ts cCs |i|S(N(RRNR0(RR0((Rt https_openZscCs|ii|S(N(RRftget_https_connectionR (RR ((RR6]s(RRRR RgR6(((RRbSs  t HTTPResponsecBsntZddddZeiiZdZdZ dZ dZ ddZdd Z dd Z RS( NicCs|otii||||ntii||||i|_d|_ d|_ d|_ d|_ d|_ d|_d|_dS(Nti(tmethodR>RiR Rtsockt debugleveltfilenoRRIt_rbuft _rbufsizeRDRERGRH(RRlRmtstrictRk((RR ts       cCsR|ioD|iid|_|io |ii||i|iqNndS(N(RtfpR(RRDR-RERH(R((RR(s     cCs-|ii|i|idd|idS(NR(i(RRDR.RERHR((R((RR)scCs|iS(N(RRK(R((RR8scCs|iS(N(RRG(R((RtgeturlscCs|io\|dj oOt|i}||jo||8}qf|i| }|i||_|Sn|i|i|}d|_|S(NRj(RRotamtRR%tLtst _raw_read(RRtRuRv((Rtreads   icCs3d}|iid}x|djod|jot|ijn oe|i|i}|pPn|id}|djo|t|i}n|i||_qW|djot|i}n |d}d|jot|ijno |}n|i| |i|}|_|S(NRjs ii( RTRRotfindtitlimitR%RwRptnew(RR{RzR|RT((Rtreadlines$5   ' cCsfd}g}xS|i}|pPn|i||t|7}|o||joPqqW|S(Nii(ttotalRRR}tlineR R%tsizehint(RRRRR~((Rt readliness   (RRRR R>RiRxRwR(R)R8RsR}R(((RRi`s       RacBstZeZRS(N(RRRitresponse_class(((RRastHTTPSConnectioncBstZeZRS(N(RRRiR(((RRsc Cs+t} t} ti| }ti|hdd<dd<}xd D]}d|||fGH|aybti |} | i }| iy| i| i}}Wntj od \}}nXWn!tj o}d|GHqJXd||fGHqJW| a| i}dG|GH| idS( Nitoffitons. fancy error handling %s (HANDLE_ERRORS = %i)s EXCEPTION: %ss status = %s, reason = %ssopen connections:(ii(NN(RLtorigR_tkeepalive_handlerR2t build_openertopenertinstall_openertposRzturlopenturltfoRxtfooR(RBRCtAttributeErrorRtIOErrorteR&thostsR+( RRBRRRzRCRRRRRR((Rt error_handlers4       cCsJdk}d}ti}ti|ti|}|i }|i |i |}|d|ifGHtit}ti|ti|}|i }|i |i |}|d|ifGHti|}d}x'|i}|o||}qPqW|i |i |}|d|ifGHdS(Ns%25s: %ss normal urllibskeepalive readRjiskeepalive readline(tmd5tformatR2RRRRRRRxRR(R|tmt hexdigestR_R}tf(RRRRRRRR((Rt continuitys6          cCsd||fGHtiidti}ti|t ||}d|GHtiidtit }ti|t ||}d|GHd||fGHdS(Ns making %i connections to: %ss( first using the normal urllib handlerss TIME: %.3f ss( now using the keepalive handler s improvement factor: %.2f( tNRtsyststdouttwriteR2RRRtfetchtt1R_tt2(RRRRR((Rtcomps     c Csdk}g}|i}xqt|D]c}|o|djo|i|nti |} | i } | i|it| q(W|i|}d}x@|dD]4}|d}||djpd||fGHqqW|S(Niis+WARNING: inconsistent length on read %i: %i(ttimetlenst starttimetrangeRRztdelaytsleepR2RRRRxRR(R R%tdifftj( RRRRRRzRRRRR((RRs&       cCst}dfdY}|adGHti|}|i}|i d}d|GHxJ|djo<t i i d|t i itid|d8}q^Wt ii d d GHti|}|i}|i ||jo d GHnd GH|adS( Nt FakeLoggercBstZdZeZZZRS(NcGs ||GHdS(N(RJRY(RRJRY((Rtdebug2s(RRRR8twarningR=(((RR1s s- fetching the file to establish a connectionis; waiting %i seconds for the server to close the connectionis %2iis s! fetching the file a second times data are identicals ERROR: DATA DIFFER(R7tdbbackupRR2RRRRxtdata1R(RzRRRtflushRRtstderrtdata2(RRzRRRRR((Rt test_timeout.s2           i cCsqdGHyt|Wn$tj o}dGHtinXHdGHt|HdGHt||HdGHt |dS(Ns,checking error hander (do this on a non-200)s.exiting - exception will prevent further testss>performing continuity test (making sure stuff isn't corrupted)sperforming speed comparisons#performing dropped-connection check( RRRRRtexitRRRR(RRR((RttestPs  t__main__s%s (ii(RR2R>R<RRR7RdRt version_infoRLRR R_RbRiRaRRRRRRRRRtinttargvRR(RRRR_RRRiRRbR2RRaRR R>R<RRRdRRR((Rt?ds@       <  b  !   "    0707010002bc03000081a400000000000000020000000144bfe47e00004305000000b600010006ffffffffffffffff0000004000000000reloc/usr/lib/python2.4/vendor-packages/urlgrabber/byterange.py# This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko # $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $ import os import stat import urllib import urllib2 import rfc822 DEBUG = None try: from cStringIO import StringIO except ImportError, msg: from StringIO import StringIO class RangeError(IOError): """Error raised when an unsatisfiable range is requested.""" pass class HTTPRangeHandler(urllib2.BaseHandler): """Handler that enables HTTP Range headers. This was extremely simple. The Range header is a HTTP feature to begin with so all this class does is tell urllib2 that the "206 Partial Content" reponse from the HTTP server is what we expected. Example: import urllib2 import byterange range_handler = range.HTTPRangeHandler() opener = urllib2.build_opener(range_handler) # install it urllib2.install_opener(opener) # create Request and set Range header req = urllib2.Request('http://www.python.org/') req.header['Range'] = 'bytes=30-50' f = urllib2.urlopen(req) """ def http_error_206(self, req, fp, code, msg, hdrs): # 206 Partial Content Response r = urllib.addinfourl(fp, hdrs, req.get_full_url()) r.code = code r.msg = msg return r def http_error_416(self, req, fp, code, msg, hdrs): # HTTP's Range Not Satisfiable error raise RangeError('Requested Range Not Satisfiable') class HTTPSRangeHandler(HTTPRangeHandler): """ Range Header support for HTTPS. """ def https_error_206(self, req, fp, code, msg, hdrs): return self.http_error_206(req, fp, code, msg, hdrs) def https_error_416(self, req, fp, code, msg, hdrs): self.https_error_416(req, fp, code, msg, hdrs) class RangeableFileObject: """File object wrapper to enable raw range handling. This was implemented primarilary for handling range specifications for file:// urls. This object effectively makes a file object look like it consists only of a range of bytes in the stream. Examples: # expose 10 bytes, starting at byte position 20, from # /etc/aliases. >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30)) # seek seeks within the range (to position 23 in this case) >>> fo.seek(3) # tell tells where your at _within the range_ (position 3 in # this case) >>> fo.tell() # read EOFs if an attempt is made to read past the last # byte in the range. the following will return only 7 bytes. >>> fo.read(30) """ def __init__(self, fo, rangetup): """Create a RangeableFileObject. fo -- a file like object. only the read() method need be supported but supporting an optimized seek() is preferable. rangetup -- a (firstbyte,lastbyte) tuple specifying the range to work over. The file object provided is assumed to be at byte offset 0. """ self.fo = fo (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) self.realpos = 0 self._do_seek(self.firstbyte) def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. This includes methods.""" if hasattr(self.fo, name): return getattr(self.fo, name) raise AttributeError, name def tell(self): """Return the position within the range. This is different from fo.seek in that position 0 is the first byte position of the range tuple. For example, if this object was created with a range tuple of (500,899), tell() will return 0 when at byte position 500 of the file. """ return (self.realpos - self.firstbyte) def seek(self,offset,whence=0): """Seek within the byte range. Positioning is identical to that described under tell(). """ assert whence in (0, 1, 2) if whence == 0: # absolute seek realoffset = self.firstbyte + offset elif whence == 1: # relative seek realoffset = self.realpos + offset elif whence == 2: # absolute from end of file # XXX: are we raising the right Error here? raise IOError('seek from end of file not supported.') # do not allow seek past lastbyte in range if self.lastbyte and (realoffset >= self.lastbyte): realoffset = self.lastbyte self._do_seek(realoffset - self.realpos) def read(self, size=-1): """Read within the range. This method will limit the size read based on the range. """ size = self._calc_read_size(size) rslt = self.fo.read(size) self.realpos += len(rslt) return rslt def readline(self, size=-1): """Read lines within the range. This method will limit the size read based on the range. """ size = self._calc_read_size(size) rslt = self.fo.readline(size) self.realpos += len(rslt) return rslt def _calc_read_size(self, size): """Handles calculating the amount of data to read based on the range. """ if self.lastbyte: if size > -1: if ((self.realpos + size) >= self.lastbyte): size = (self.lastbyte - self.realpos) else: size = (self.lastbyte - self.realpos) return size def _do_seek(self,offset): """Seek based on whether wrapped object supports seek(). offset is relative to the current position (self.realpos). """ assert offset >= 0 if not hasattr(self.fo, 'seek'): self._poor_mans_seek(offset) else: self.fo.seek(self.realpos + offset) self.realpos+= offset def _poor_mans_seek(self,offset): """Seek by calling the wrapped file objects read() method. This is used for file like objects that do not have native seek support. The wrapped objects read() method is called to manually seek to the desired position. offset -- read this number of bytes from the wrapped file object. raise RangeError if we encounter EOF before reaching the specified offset. """ pos = 0 bufsize = 1024 while pos < offset: if (pos + bufsize) > offset: bufsize = offset - pos buf = self.fo.read(bufsize) if len(buf) != bufsize: raise RangeError('Requested Range Not Satisfiable') pos+= bufsize class FileRangeHandler(urllib2.FileHandler): """FileHandler subclass that adds Range support. This class handles Range headers exactly like an HTTP server would. """ def open_local_file(self, req): import mimetypes import mimetools host = req.get_host() file = req.get_selector() localfile = urllib.url2pathname(file) stats = os.stat(localfile) size = stats[stat.ST_SIZE] modified = rfc822.formatdate(stats[stat.ST_MTIME]) mtype = mimetypes.guess_type(file)[0] if host: host, port = urllib.splitport(host) if port or socket.gethostbyname(host) not in self.get_names(): raise urllib2.URLError('file not on local host') fo = open(localfile,'rb') brange = req.headers.get('Range',None) brange = range_header_to_tuple(brange) assert brange != () if brange: (fb,lb) = brange if lb == '': lb = size if fb < 0 or fb > size or lb > size: raise RangeError('Requested Range Not Satisfiable') size = (lb - fb) fo = RangeableFileObject(fo, (fb,lb)) headers = mimetools.Message(StringIO( 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified))) return urllib.addinfourl(fo, headers, 'file:'+file) # FTP Range Support # Unfortunately, a large amount of base FTP code had to be copied # from urllib and urllib2 in order to insert the FTP REST command. # Code modifications for range support have been commented as # follows: # -- range support modifications start/end here from urllib import splitport, splituser, splitpasswd, splitattr, \ unquote, addclosehook, addinfourl import ftplib import socket import sys import ftplib import mimetypes import mimetools class FTPRangeHandler(urllib2.FTPHandler): def ftp_open(self, req): host = req.get_host() if not host: raise IOError, ('ftp error', 'no host given') host, port = splitport(host) if port is None: port = ftplib.FTP_PORT # username/password handling user, host = splituser(host) if user: user, passwd = splitpasswd(user) else: passwd = None host = unquote(host) user = unquote(user or '') passwd = unquote(passwd or '') try: host = socket.gethostbyname(host) except socket.error, msg: raise urllib2.URLError(msg) path, attrs = splitattr(req.get_selector()) dirs = path.split('/') dirs = map(unquote, dirs) dirs, file = dirs[:-1], dirs[-1] if dirs and not dirs[0]: dirs = dirs[1:] try: fw = self.connect_ftp(user, passwd, host, port, dirs) type = file and 'I' or 'D' for attr in attrs: attr, value = splitattr(attr) if attr.lower() == 'type' and \ value in ('a', 'A', 'i', 'I', 'd', 'D'): type = value.upper() # -- range support modifications start here rest = None range_tup = range_header_to_tuple(req.headers.get('Range',None)) assert range_tup != () if range_tup: (fb,lb) = range_tup if fb > 0: rest = fb # -- range support modifications end here fp, retrlen = fw.retrfile(file, type, rest) # -- range support modifications start here if range_tup: (fb,lb) = range_tup if lb == '': if retrlen is None or retrlen == 0: raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') lb = retrlen retrlen = lb - fb if retrlen < 0: # beginning of range is larger than file raise RangeError('Requested Range Not Satisfiable') else: retrlen = lb - fb fp = RangeableFileObject(fp, (0,retrlen)) # -- range support modifications end here headers = "" mtype = mimetypes.guess_type(req.get_full_url())[0] if mtype: headers += "Content-Type: %s\n" % mtype if retrlen is not None and retrlen >= 0: headers += "Content-Length: %d\n" % retrlen sf = StringIO(headers) headers = mimetools.Message(sf) return addinfourl(fp, headers, req.get_full_url()) except ftplib.all_errors, msg: raise IOError, ('ftp error', msg), sys.exc_info()[2] def connect_ftp(self, user, passwd, host, port, dirs): fw = ftpwrapper(user, passwd, host, port, dirs) return fw class ftpwrapper(urllib.ftpwrapper): # range support note: # this ftpwrapper code is copied directly from # urllib. The only enhancement is to add the rest # argument and pass it on to ftp.ntransfercmd def retrfile(self, file, type, rest=None): self.endtransfer() if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 else: cmd = 'TYPE ' + type; isdir = 0 try: self.ftp.voidcmd(cmd) except ftplib.all_errors: self.init() self.ftp.voidcmd(cmd) conn = None if file and not isdir: # Use nlst to see if the file exists at all try: self.ftp.nlst(file) except ftplib.error_perm, reason: raise IOError, ('ftp error', reason), sys.exc_info()[2] # Restore the transfer mode! self.ftp.voidcmd(cmd) # Try to retrieve as a file try: cmd = 'RETR ' + file conn = self.ftp.ntransfercmd(cmd, rest) except ftplib.error_perm, reason: if str(reason)[:3] == '501': # workaround for REST not supported error fp, retrlen = self.retrfile(file, type) fp = RangeableFileObject(fp, (rest,'')) return (fp, retrlen) elif str(reason)[:3] != '550': raise IOError, ('ftp error', reason), sys.exc_info()[2] if not conn: # Set transfer mode to ASCII! self.ftp.voidcmd('TYPE A') # Try a directory listing if file: cmd = 'LIST ' + file else: cmd = 'LIST' conn = self.ftp.ntransfercmd(cmd) self.busy = 1 # Pass back both a suitably decorated object and a retrieval length return (addclosehook(conn[0].makefile('rb'), self.endtransfer), conn[1]) #################################################################### # Range Tuple Functions # XXX: These range tuple functions might go better in a class. _rangere = None def range_header_to_tuple(range_header): """Get a (firstbyte,lastbyte) tuple from a Range header value. Range headers have the form "bytes=-". This function pulls the firstbyte and lastbyte values and returns a (firstbyte,lastbyte) tuple. If lastbyte is not specified in the header value, it is returned as an empty string in the tuple. Return None if range_header is None Return () if range_header does not conform to the range spec pattern. """ global _rangere if range_header is None: return None if _rangere is None: import re _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)') match = _rangere.match(range_header) if match: tup = range_tuple_normalize(match.group(1,2)) if tup and tup[1]: tup = (tup[0],tup[1]+1) return tup return () def range_tuple_to_header(range_tup): """Convert a range tuple to a Range header value. Return a string of the form "bytes=-" or None if no range is needed. """ if range_tup is None: return None range_tup = range_tuple_normalize(range_tup) if range_tup: if range_tup[1]: range_tup = (range_tup[0],range_tup[1] - 1) return 'bytes=%s-%s' % range_tup def range_tuple_normalize(range_tup): """Normalize a (first_byte,last_byte) range tuple. Return a tuple whose first element is guaranteed to be an int and whose second element will be '' (meaning: the last byte) or an int. Finally, return None if the normalized tuple == (0,'') as that is equivelant to retrieving the entire file. """ if range_tup is None: return None # handle first byte fb = range_tup[0] if fb in (None,''): fb = 0 else: fb = int(fb) # handle last byte try: lb = range_tup[1] except IndexError: lb = '' else: if lb is None: lb = '' elif lb != '': lb = int(lb) # check if range is over the entire file if (fb,lb) == (0,''): return None # check that the range is valid if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) return (fb,lb) 0707010002bbfb000041ed0000000000000000000000024ae194b300000000000000b600010006ffffffffffffffff0000000e00000000reloc/usr/bin0707010002bbfc000081ed0000000000000002000000014ae194ad00001304000000b600010006ffffffffffffffff0000001900000000reloc/usr/bin/urlgrabber#!/usr/bin/python -t # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., # 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko """Usage: urlgrabber [OPTIONS] URL [FILE] OPTIONS --copy-local ignored except for file:// urls, in which case it specifies whether urlgrab should still make a copy of the file, or simply point to the existing copy. --throttle=NUMBER if it's an int, it's the bytes/second throttle limit. If it's a float, it is first multiplied by bandwidth. If throttle == 0, throttling is disabled. If None, the module-level default (which can be set with set_throttle) is used. --bandwidth=NUMBER the nominal max bandwidth in bytes/second. If throttle is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set with set_bandwidth) is used. --range=RANGE a tuple of the form first_byte,last_byte describing a byte range to retrieve. Either or both of the values may be specified. If first_byte is None, byte offset 0 is assumed. If last_byte is None, the last byte available is assumed. Note that both first and last_byte values are inclusive so a range of (10,11) would return the 10th and 11th bytes of the resource. --user-agent=STR the user-agent string provide if the url is HTTP. RETRY OPTIONS --retry=NUMBER the number of times to retry the grab before bailing. If this is zero, it will retry forever. This was intentional... really, it was :). If this value is not supplied or is supplied but is None retrying does not occur. --retrycodes a sequence of errorcodes (values of e.errno) for which it should retry. See the doc on URLGrabError for more details on this. retrycodes defaults to -1,2,4,5,6,7 if not specified explicitly. """ # $Id: urlgrabber,v 1.5 2006/03/02 20:56:56 mstenner Exp $ import sys import urlgrabber.grabber from urlgrabber.grabber import urlgrab DEBUG=0 def main(): kwargs = {} url = None file = None proxies = {} for arg in sys.argv[1:]: if arg in ('--help','-h','-?'): print __doc__ sys.exit(0) elif arg in ('--debug', '-d'): global DEBUG DEBUG = 1 elif arg.startswith('--'): ls = arg[2:].split('=') a,v = (ls[0],len(ls) > 1 and ls[1] or 1) a = a.replace('-','_') if v is None: v = 1 if a in ('retrycodes','range'): v = v.split(',') if a.endswith('_proxy'): proxies[ a[0:a.find('_')] ] = v continue if DEBUG: print a, '=', v kwargs[a] = v elif not url: url = arg elif not file: file = arg else: print "Bad usage. Try %s --help" % (sys.argv[0],) sys.exit(99) if url is None: print "Bad usage. Try %s --help" % (sys.argv[0],) sys.exit(99) if len(proxies): kwargs['proxies'] = proxies if DEBUG: print 'kwargs: ', kwargs print 'URL: ', url print 'FILE: ', file try: from urlgrabber.progress import text_progress_meter except ImportError, e: pass else: kwargs['progress_obj'] = text_progress_meter() urlgrabber.grabber.DEBUG = DEBUG filename = urlgrab(url,filename=file,**kwargs) print "file written to %s" % (filename,) if __name__ == '__main__': main() 07070100000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000b00000000TRAILER!!!