Browse Source

2017-12-21 数据分析

jtahstu 1 year ago
parent
commit
aea6aadf5c

+ 184 - 181
.idea/workspace.xml

@@ -1,28 +1,19 @@
1 1
 <?xml version="1.0" encoding="UTF-8"?>
2 2
 <project version="4">
3 3
   <component name="ChangeListManager">
4
-    <list default="true" id="28855cfc-f511-4773-979a-c721c4e672b8" name="Default" comment="">
5
-      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py" />
6
-      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py" />
7
-      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_keywords.py" />
4
+    <list default="true" id="28855cfc-f511-4773-979a-c721c4e672b8" name="Default" comment="2017-12-18 拉">
8 5
       <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py" />
9
-      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py" />
10 6
       <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tool.py" />
11 7
       <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py" />
12
-      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/test/ip.py" />
13
-      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/test/ip_https.txt" />
8
+      <change beforePath="" afterPath="$PROJECT_DIR$/www_zhipin_com/spiders/lagou_list.py" />
14 9
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
15
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/CONSOLA.TTF" afterPath="" />
10
+      <change beforePath="$PROJECT_DIR$/README.md" afterPath="$PROJECT_DIR$/README.md" />
11
+      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py" />
12
+      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py" />
13
+      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py" />
16 14
       <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py" />
17
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/eng.txt" afterPath="" />
18
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/stopwords.txt" afterPath="" />
19 15
       <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/tool.py" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/tool.py" />
20
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud2.py" afterPath="$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud2.py" />
21
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/words_n6.txt" afterPath="" />
22
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/analyse/wordss.txt" afterPath="" />
23
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/pipelines.py" afterPath="$PROJECT_DIR$/www_zhipin_com/pipelines.py" />
24 16
       <change beforePath="$PROJECT_DIR$/www_zhipin_com/spiders/detail.py" afterPath="$PROJECT_DIR$/www_zhipin_com/spiders/detail.py" />
25
-      <change beforePath="$PROJECT_DIR$/www_zhipin_com/spiders/zhipin_spider.py" afterPath="$PROJECT_DIR$/www_zhipin_com/spiders/zhipin_spider.py" />
26 17
     </list>
27 18
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
28 19
     <option name="TRACKING_ENABLED" value="true" />
@@ -33,14 +24,15 @@
33 24
   </component>
34 25
   <component name="CoverageDataManager">
35 26
     <SUITE FILE_PATH="coverage/www_zhipin_com$jieba_1.coverage" NAME="jieba_1 Coverage Results" MODIFIED="1513045174091" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/test" />
36
-    <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_education.coverage" NAME="analyse_education Coverage Results" MODIFIED="1513320618047" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
37
-    <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_industry.coverage" NAME="analyse_industry Coverage Results" MODIFIED="1513320678050" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
38
-    <SUITE FILE_PATH="coverage/www_zhipin_com$clear_data.coverage" NAME="clear_data Coverage Results" MODIFIED="1513500676971" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
27
+    <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_education.coverage" NAME="analyse_education Coverage Results" MODIFIED="1513655308208" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
28
+    <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_industry.coverage" NAME="analyse_industry Coverage Results" MODIFIED="1513655043158" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
29
+    <SUITE FILE_PATH="coverage/www_zhipin_com$clear_data.coverage" NAME="clear_data Coverage Results" MODIFIED="1513592966996" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
39 30
     <SUITE FILE_PATH="coverage/www_zhipin_com$ip.coverage" NAME="ip Coverage Results" MODIFIED="1513347893281" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/test" />
31
+    <SUITE FILE_PATH="coverage/www_zhipin_com$lagou_list.coverage" NAME="lagou_list Coverage Results" MODIFIED="1513584755997" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/spiders" />
40 32
     <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_salary.coverage" NAME="analyse_salary Coverage Results" MODIFIED="1513434288349" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
41 33
     <SUITE FILE_PATH="coverage/www_zhipin_com$jieba_test.coverage" NAME="jieba_test Coverage Results" MODIFIED="1513075610169" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/test" />
42
-    <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_tag.coverage" NAME="analyse_tag Coverage Results" MODIFIED="1513319121897" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
43
-    <SUITE FILE_PATH="coverage/www_zhipin_com$tool.coverage" NAME="tool Coverage Results" MODIFIED="1513498686031" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
34
+    <SUITE FILE_PATH="coverage/www_zhipin_com$tool.coverage" NAME="tool Coverage Results" MODIFIED="1513587529119" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
35
+    <SUITE FILE_PATH="coverage/www_zhipin_com$analyse_tag.coverage" NAME="analyse_tag Coverage Results" MODIFIED="1513652706494" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
44 36
     <SUITE FILE_PATH="coverage/www_zhipin_com$detail.coverage" NAME="detail Coverage Results" MODIFIED="1512988204310" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
45 37
     <SUITE FILE_PATH="coverage/www_zhipin_com$xx.coverage" NAME="xx Coverage Results" MODIFIED="1513327774580" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/test" />
46 38
     <SUITE FILE_PATH="coverage/www_zhipin_com$word_cloud.coverage" NAME="word_cloud Coverage Results" MODIFIED="1513072573978" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/www_zhipin_com/analyse" />
@@ -59,50 +51,50 @@
59 51
       <file leaf-file-name="zhipin_spider.py" pinned="false" current-in-tab="false">
60 52
         <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/zhipin_spider.py">
61 53
           <provider selected="true" editor-type-id="text-editor">
62
-            <state relative-caret-position="0">
63
-              <caret line="29" column="32" lean-forward="false" selection-start-line="29" selection-start-column="32" selection-end-line="29" selection-end-column="32" />
54
+            <state relative-caret-position="-164">
55
+              <caret line="73" column="50" lean-forward="false" selection-start-line="73" selection-start-column="50" selection-end-line="73" selection-end-column="50" />
64 56
               <folding />
65 57
             </state>
66 58
           </provider>
67 59
         </entry>
68 60
       </file>
69
-      <file leaf-file-name="analyse_workyear.py" pinned="false" current-in-tab="true">
70
-        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py">
61
+      <file leaf-file-name="analyse_education.py" pinned="false" current-in-tab="false">
62
+        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py">
71 63
           <provider selected="true" editor-type-id="text-editor">
72
-            <state relative-caret-position="288">
73
-              <caret line="12" column="48" lean-forward="false" selection-start-line="12" selection-start-column="48" selection-end-line="12" selection-end-column="48" />
74
-              <folding>
75
-                <element signature="e#106#131#0" expanded="true" />
76
-              </folding>
64
+            <state relative-caret-position="144">
65
+              <caret line="6" column="25" lean-forward="true" selection-start-line="6" selection-start-column="25" selection-end-line="6" selection-end-column="25" />
66
+              <folding />
77 67
             </state>
78 68
           </provider>
79 69
         </entry>
80 70
       </file>
81
-      <file leaf-file-name="analyse_tool.py" pinned="false" current-in-tab="false">
82
-        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tool.py">
71
+      <file leaf-file-name="analyse_salary.py" pinned="false" current-in-tab="true">
72
+        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py">
83 73
           <provider selected="true" editor-type-id="text-editor">
84
-            <state relative-caret-position="312">
85
-              <caret line="13" column="0" lean-forward="false" selection-start-line="13" selection-start-column="0" selection-end-line="13" selection-end-column="0" />
74
+            <state relative-caret-position="432">
75
+              <caret line="21" column="0" lean-forward="true" selection-start-line="21" selection-start-column="0" selection-end-line="21" selection-end-column="0" />
86 76
               <folding />
87 77
             </state>
88 78
           </provider>
89 79
         </entry>
90 80
       </file>
91
-      <file leaf-file-name="analyse_salary.py" pinned="false" current-in-tab="false">
92
-        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py">
81
+      <file leaf-file-name="clear_data.py" pinned="false" current-in-tab="false">
82
+        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py">
93 83
           <provider selected="true" editor-type-id="text-editor">
94
-            <state relative-caret-position="620">
95
-              <caret line="78" column="26" lean-forward="true" selection-start-line="78" selection-start-column="26" selection-end-line="78" selection-end-column="26" />
96
-              <folding />
84
+            <state relative-caret-position="758">
85
+              <caret line="115" column="0" lean-forward="false" selection-start-line="115" selection-start-column="0" selection-end-line="125" selection-end-column="15" />
86
+              <folding>
87
+                <element signature="e#132#147#0" expanded="true" />
88
+              </folding>
97 89
             </state>
98 90
           </provider>
99 91
         </entry>
100 92
       </file>
101
-      <file leaf-file-name="clear_data.py" pinned="false" current-in-tab="false">
102
-        <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py">
93
+      <file leaf-file-name="lagou_list.py" pinned="false" current-in-tab="false">
94
+        <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/lagou_list.py">
103 95
           <provider selected="true" editor-type-id="text-editor">
104
-            <state relative-caret-position="-1300">
105
-              <caret line="105" column="15" lean-forward="false" selection-start-line="105" selection-start-column="15" selection-end-line="105" selection-end-column="15" />
96
+            <state relative-caret-position="-1340">
97
+              <caret line="18" column="31" lean-forward="false" selection-start-line="18" selection-start-column="31" selection-end-line="18" selection-end-column="31" />
106 98
               <folding />
107 99
             </state>
108 100
           </provider>
@@ -139,6 +131,12 @@
139 131
       <find>date</find>
140 132
       <find>insert</find>
141 133
       <find>jtahstu</find>
134
+      <find>format</find>
135
+      <find>find</find>
136
+      <find>distin</find>
137
+      <find>sort</find>
138
+      <find>jtahstu:jtahstu@</find>
139
+      <find>jtahstu:jt</find>
142 140
     </findStrings>
143 141
     <replaceStrings>
144 142
       <replace />
@@ -197,9 +195,6 @@
197 195
         <option value="$PROJECT_DIR$/www_zhipin_com/settings.py" />
198 196
         <option value="$PROJECT_DIR$/www_zhipin_com/analyse/words_n.txt" />
199 197
         <option value="$PROJECT_DIR$/www_zhipin_com/analyse/stopwords.txt" />
200
-        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py" />
201
-        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py" />
202
-        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py" />
203 198
         <option value="$PROJECT_DIR$/www_zhipin_com/test/xx.py" />
204 199
         <option value="$PROJECT_DIR$/www_zhipin_com/test/ip.py" />
205 200
         <option value="$PROJECT_DIR$/www_zhipin_com/test/ip_https.txt" />
@@ -211,11 +206,16 @@
211 206
         <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tool.py" />
212 207
         <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_keywords.py" />
213 208
         <option value="$PROJECT_DIR$/www_zhipin_com/spiders/detail.py" />
214
-        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py" />
215
-        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/tool.py" />
216 209
         <option value="$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud2.py" />
217
-        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py" />
210
+        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/tool.py" />
211
+        <option value="$PROJECT_DIR$/www_zhipin_com/spiders/lagou_list.py" />
212
+        <option value="$PROJECT_DIR$/README.md" />
213
+        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py" />
218 214
         <option value="$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py" />
215
+        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py" />
216
+        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py" />
217
+        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py" />
218
+        <option value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py" />
219 219
       </list>
220 220
     </option>
221 221
   </component>
@@ -225,10 +225,10 @@
225 225
     <detection-done>true</detection-done>
226 226
     <sorting>DEFINITION_ORDER</sorting>
227 227
   </component>
228
-  <component name="ProjectFrameBounds">
229
-    <option name="y" value="24" />
228
+  <component name="ProjectFrameBounds" extendedState="6">
229
+    <option name="y" value="22" />
230 230
     <option name="width" value="1898" />
231
-    <option name="height" value="1011" />
231
+    <option name="height" value="983" />
232 232
   </component>
233 233
   <component name="ProjectInspectionProfilesVisibleTreeState">
234 234
     <entry key="Project Default">
@@ -268,6 +268,7 @@
268 268
       <foldersAlwaysOnTop value="true" />
269 269
     </navigator>
270 270
     <panes>
271
+      <pane id="Scratches" />
271 272
       <pane id="ProjectPane">
272 273
         <subPane>
273 274
           <expand>
@@ -286,11 +287,16 @@
286 287
               <item name="www_zhipin_com" type="462c0819:PsiDirectoryNode" />
287 288
               <item name="analyse" type="462c0819:PsiDirectoryNode" />
288 289
             </path>
290
+            <path>
291
+              <item name="www_zhipin_com" type="b2602c69:ProjectViewProjectNode" />
292
+              <item name="www_zhipin_com" type="462c0819:PsiDirectoryNode" />
293
+              <item name="www_zhipin_com" type="462c0819:PsiDirectoryNode" />
294
+              <item name="spiders" type="462c0819:PsiDirectoryNode" />
295
+            </path>
289 296
           </expand>
290 297
           <select />
291 298
         </subPane>
292 299
       </pane>
293
-      <pane id="Scratches" />
294 300
       <pane id="Scope" />
295 301
     </panes>
296 302
   </component>
@@ -321,8 +327,8 @@
321 327
       </list>
322 328
     </option>
323 329
   </component>
324
-  <component name="RunManager" selected="Python.clear_data">
325
-    <configuration name="analyse_salary" type="PythonConfigurationType" factoryName="Python" temporary="true">
330
+  <component name="RunManager" selected="Python.analyse_education">
331
+    <configuration name="analyse_education" type="PythonConfigurationType" factoryName="Python" temporary="true">
326 332
       <option name="INTERPRETER_OPTIONS" value="" />
327 333
       <option name="PARENT_ENVS" value="true" />
328 334
       <envs>
@@ -335,13 +341,13 @@
335 341
       <option name="ADD_SOURCE_ROOTS" value="true" />
336 342
       <module name="www_zhipin_com" />
337 343
       <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
338
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py" />
344
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py" />
339 345
       <option name="PARAMETERS" value="" />
340 346
       <option name="SHOW_COMMAND_LINE" value="false" />
341 347
       <option name="EMULATE_TERMINAL" value="false" />
342 348
       <option name="MODULE_MODE" value="false" />
343 349
     </configuration>
344
-    <configuration name="analyse_workyear" type="PythonConfigurationType" factoryName="Python" temporary="true">
350
+    <configuration name="analyse_industry" type="PythonConfigurationType" factoryName="Python" temporary="true">
345 351
       <option name="INTERPRETER_OPTIONS" value="" />
346 352
       <option name="PARENT_ENVS" value="true" />
347 353
       <envs>
@@ -354,13 +360,13 @@
354 360
       <option name="ADD_SOURCE_ROOTS" value="true" />
355 361
       <module name="www_zhipin_com" />
356 362
       <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
357
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py" />
363
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py" />
358 364
       <option name="PARAMETERS" value="" />
359 365
       <option name="SHOW_COMMAND_LINE" value="false" />
360 366
       <option name="EMULATE_TERMINAL" value="false" />
361 367
       <option name="MODULE_MODE" value="false" />
362 368
     </configuration>
363
-    <configuration name="clear_data" type="PythonConfigurationType" factoryName="Python" temporary="true">
369
+    <configuration name="analyse_tag" type="PythonConfigurationType" factoryName="Python" temporary="true">
364 370
       <option name="INTERPRETER_OPTIONS" value="" />
365 371
       <option name="PARENT_ENVS" value="true" />
366 372
       <envs>
@@ -373,13 +379,13 @@
373 379
       <option name="ADD_SOURCE_ROOTS" value="true" />
374 380
       <module name="www_zhipin_com" />
375 381
       <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
376
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py" />
382
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py" />
377 383
       <option name="PARAMETERS" value="" />
378 384
       <option name="SHOW_COMMAND_LINE" value="false" />
379 385
       <option name="EMULATE_TERMINAL" value="false" />
380 386
       <option name="MODULE_MODE" value="false" />
381 387
     </configuration>
382
-    <configuration name="tool" type="PythonConfigurationType" factoryName="Python" temporary="true">
388
+    <configuration name="clear_data" type="PythonConfigurationType" factoryName="Python" temporary="true">
383 389
       <option name="INTERPRETER_OPTIONS" value="" />
384 390
       <option name="PARENT_ENVS" value="true" />
385 391
       <envs>
@@ -392,13 +398,13 @@
392 398
       <option name="ADD_SOURCE_ROOTS" value="true" />
393 399
       <module name="www_zhipin_com" />
394 400
       <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
395
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/tool.py" />
401
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py" />
396 402
       <option name="PARAMETERS" value="" />
397 403
       <option name="SHOW_COMMAND_LINE" value="false" />
398 404
       <option name="EMULATE_TERMINAL" value="false" />
399 405
       <option name="MODULE_MODE" value="false" />
400 406
     </configuration>
401
-    <configuration name="word_cloud2" type="PythonConfigurationType" factoryName="Python" temporary="true">
407
+    <configuration name="tool" type="PythonConfigurationType" factoryName="Python" temporary="true">
402 408
       <option name="INTERPRETER_OPTIONS" value="" />
403 409
       <option name="PARENT_ENVS" value="true" />
404 410
       <envs>
@@ -411,26 +417,26 @@
411 417
       <option name="ADD_SOURCE_ROOTS" value="true" />
412 418
       <module name="www_zhipin_com" />
413 419
       <EXTENSION ID="PythonCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" runner="coverage.py" />
414
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud2.py" />
420
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/www_zhipin_com/analyse/tool.py" />
415 421
       <option name="PARAMETERS" value="" />
416 422
       <option name="SHOW_COMMAND_LINE" value="false" />
417 423
       <option name="EMULATE_TERMINAL" value="false" />
418 424
       <option name="MODULE_MODE" value="false" />
419 425
     </configuration>
420 426
     <list size="5">
421
-      <item index="0" class="java.lang.String" itemvalue="Python.analyse_workyear" />
422
-      <item index="1" class="java.lang.String" itemvalue="Python.analyse_salary" />
423
-      <item index="2" class="java.lang.String" itemvalue="Python.tool" />
424
-      <item index="3" class="java.lang.String" itemvalue="Python.word_cloud2" />
425
-      <item index="4" class="java.lang.String" itemvalue="Python.clear_data" />
427
+      <item index="0" class="java.lang.String" itemvalue="Python.tool" />
428
+      <item index="1" class="java.lang.String" itemvalue="Python.clear_data" />
429
+      <item index="2" class="java.lang.String" itemvalue="Python.analyse_tag" />
430
+      <item index="3" class="java.lang.String" itemvalue="Python.analyse_industry" />
431
+      <item index="4" class="java.lang.String" itemvalue="Python.analyse_education" />
426 432
     </list>
427 433
     <recent_temporary>
428 434
       <list size="5">
429
-        <item index="0" class="java.lang.String" itemvalue="Python.clear_data" />
430
-        <item index="1" class="java.lang.String" itemvalue="Python.word_cloud2" />
431
-        <item index="2" class="java.lang.String" itemvalue="Python.tool" />
432
-        <item index="3" class="java.lang.String" itemvalue="Python.analyse_salary" />
433
-        <item index="4" class="java.lang.String" itemvalue="Python.analyse_workyear" />
435
+        <item index="0" class="java.lang.String" itemvalue="Python.analyse_education" />
436
+        <item index="1" class="java.lang.String" itemvalue="Python.analyse_industry" />
437
+        <item index="2" class="java.lang.String" itemvalue="Python.analyse_tag" />
438
+        <item index="3" class="java.lang.String" itemvalue="Python.clear_data" />
439
+        <item index="4" class="java.lang.String" itemvalue="Python.tool" />
434 440
       </list>
435 441
     </recent_temporary>
436 442
   </component>
@@ -455,20 +461,29 @@
455 461
       <option name="project" value="LOCAL" />
456 462
       <updated>1513313181373</updated>
457 463
     </task>
458
-    <option name="localTasksCounter" value="2" />
464
+    <task id="LOCAL-00002" summary="2017-12-14 数据分析、爬虫完善等">
465
+      <created>1513512505664</created>
466
+      <option name="number" value="00002" />
467
+      <option name="presentableId" value="LOCAL-00002" />
468
+      <option name="project" value="LOCAL" />
469
+      <updated>1513512505665</updated>
470
+    </task>
471
+    <option name="localTasksCounter" value="3" />
459 472
     <servers />
460 473
   </component>
461 474
   <component name="ToolWindowManager">
462
-    <frame x="0" y="24" width="1898" height="1011" extended-state="0" />
475
+    <frame x="0" y="22" width="1680" height="983" extended-state="6" />
463 476
     <editor active="true" />
464 477
     <layout>
465 478
       <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
466 479
       <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
480
+      <window_info id="Documentation" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.12087912" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" x="0" y="0" width="424" height="783" />
481
+      <window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.17314488" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
467 482
       <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32979977" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
468 483
       <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32979977" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
469
-      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.24401368" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
484
+      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.24028268" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
470 485
       <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32979977" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
471
-      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.22790948" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
486
+      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.23076923" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
472 487
       <window_info id="Docker" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
473 488
       <window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
474 489
       <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32839224" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
@@ -480,7 +495,6 @@
480 495
       <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
481 496
       <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
482 497
       <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
483
-      <window_info id="File Transfer" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.17314488" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
484 498
       <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
485 499
       <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
486 500
     </layout>
@@ -502,14 +516,6 @@
502 516
     <watches-manager />
503 517
   </component>
504 518
   <component name="editorHistoryManager">
505
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/items.py">
506
-      <provider selected="true" editor-type-id="text-editor">
507
-        <state relative-caret-position="305">
508
-          <caret line="21" column="34" lean-forward="true" selection-start-line="21" selection-start-column="34" selection-end-line="21" selection-end-column="34" />
509
-        </state>
510
-      </provider>
511
-    </entry>
512
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/word.txt" />
513 519
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words.txt" />
514 520
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud.py">
515 521
       <provider selected="true" editor-type-id="text-editor">
@@ -543,31 +549,7 @@
543 549
         </state>
544 550
       </provider>
545 551
     </entry>
546
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py">
547
-      <provider selected="true" editor-type-id="text-editor">
548
-        <state relative-caret-position="282">
549
-          <caret line="20" column="50" lean-forward="true" selection-start-line="20" selection-start-column="50" selection-end-line="20" selection-end-column="50" />
550
-          <folding />
551
-        </state>
552
-      </provider>
553
-    </entry>
554
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py">
555
-      <provider selected="true" editor-type-id="text-editor">
556
-        <state relative-caret-position="144">
557
-          <caret line="6" column="25" lean-forward="true" selection-start-line="6" selection-start-column="25" selection-end-line="6" selection-end-column="25" />
558
-          <folding />
559
-        </state>
560
-      </provider>
561
-    </entry>
562 552
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/test/xx.py" />
563
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py">
564
-      <provider selected="true" editor-type-id="text-editor">
565
-        <state relative-caret-position="520">
566
-          <caret line="27" column="4" lean-forward="true" selection-start-line="27" selection-start-column="4" selection-end-line="27" selection-end-column="45" />
567
-          <folding />
568
-        </state>
569
-      </provider>
570
-    </entry>
571 553
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words_eng1.txt" />
572 554
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words_eng2.txt" />
573 555
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words_eng3.txt">
@@ -592,14 +574,6 @@
592 574
         </state>
593 575
       </provider>
594 576
     </entry>
595
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/settings.py">
596
-      <provider selected="true" editor-type-id="text-editor">
597
-        <state relative-caret-position="1224">
598
-          <caret line="51" column="0" lean-forward="false" selection-start-line="41" selection-start-column="4" selection-end-line="51" selection-end-column="0" />
599
-          <folding />
600
-        </state>
601
-      </provider>
602
-    </entry>
603 577
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/data/iApp.jobs_java.json" />
604 578
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/data/jobs_frontend_20171215.json">
605 579
       <provider selected="true" editor-type-id="text-editor">
@@ -622,148 +596,177 @@
622 596
         </state>
623 597
       </provider>
624 598
     </entry>
625
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/pipelines.py">
599
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/data/iApp.jobs_php_analyse.json" />
600
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/data/jobs_php_20171216.json">
626 601
       <provider selected="true" editor-type-id="text-editor">
627
-        <state relative-caret-position="304">
628
-          <caret line="40" column="56" lean-forward="true" selection-start-line="40" selection-start-column="19" selection-end-line="40" selection-end-column="56" />
629
-          <folding>
630
-            <element signature="e#192#205#0" expanded="false" />
631
-          </folding>
602
+        <state relative-caret-position="-2081">
603
+          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
604
+          <folding />
632 605
         </state>
633 606
       </provider>
634 607
     </entry>
635
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/data/iApp.jobs_php_analyse.json" />
636
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_keywords.py">
608
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/stopwords.txt" />
609
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words_n6.txt" />
610
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/wordss.txt" />
611
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words1.txt" />
612
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/1.json" />
613
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words2.txt" />
614
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words3.txt">
637 615
       <provider selected="true" editor-type-id="text-editor">
638
-        <state relative-caret-position="336">
639
-          <caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="14" selection-end-column="41" />
640
-          <folding>
641
-            <element signature="e#106#121#0" expanded="false" />
642
-          </folding>
616
+        <state relative-caret-position="456">
617
+          <caret line="0" column="3246" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="356716" />
643 618
         </state>
644 619
       </provider>
645 620
     </entry>
646
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/zhipin_spider.py">
621
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/dict.txt">
647 622
       <provider selected="true" editor-type-id="text-editor">
648
-        <state relative-caret-position="0">
649
-          <caret line="29" column="32" lean-forward="false" selection-start-line="29" selection-start-column="32" selection-end-line="29" selection-end-column="32" />
623
+        <state relative-caret-position="-244">
624
+          <caret line="1203" column="9" lean-forward="true" selection-start-line="1203" selection-start-column="9" selection-end-line="1203" selection-end-column="9" />
650 625
           <folding />
651 626
         </state>
652 627
       </provider>
653 628
     </entry>
654
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/data/jobs_php_20171216.json">
629
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/test/jieba_test.py">
655 630
       <provider selected="true" editor-type-id="text-editor">
656
-        <state relative-caret-position="-2081">
657
-          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
631
+        <state relative-caret-position="-2428">
632
+          <caret line="8" column="0" lean-forward="false" selection-start-line="8" selection-start-column="0" selection-end-line="8" selection-end-column="11" />
633
+          <folding />
658 634
         </state>
659 635
       </provider>
660 636
     </entry>
661
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/stopwords.txt" />
662
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words_n6.txt" />
663
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/wordss.txt" />
664
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/detail.py">
637
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud2.py">
665 638
       <provider selected="true" editor-type-id="text-editor">
666
-        <state relative-caret-position="-49">
667
-          <caret line="11" column="11" lean-forward="true" selection-start-line="11" selection-start-column="11" selection-end-line="11" selection-end-column="11" />
639
+        <state relative-caret-position="339">
640
+          <caret line="85" column="50" lean-forward="false" selection-start-line="85" selection-start-column="50" selection-end-line="85" selection-end-column="50" />
668 641
           <folding />
669 642
         </state>
670 643
       </provider>
671 644
     </entry>
672
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words1.txt">
645
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/settings.py">
673 646
       <provider selected="true" editor-type-id="text-editor">
674
-        <state relative-caret-position="240">
675
-          <caret line="0" column="1150" lean-forward="true" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="767650" />
647
+        <state relative-caret-position="2400">
648
+          <caret line="100" column="40" lean-forward="true" selection-start-line="100" selection-start-column="40" selection-end-line="100" selection-end-column="40" />
676 649
           <folding />
677 650
         </state>
678 651
       </provider>
679 652
     </entry>
680
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/1.json">
653
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tool.py">
681 654
       <provider selected="true" editor-type-id="text-editor">
682
-        <state relative-caret-position="0">
683
-          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
655
+        <state relative-caret-position="312">
656
+          <caret line="13" column="0" lean-forward="false" selection-start-line="13" selection-start-column="0" selection-end-line="13" selection-end-column="0" />
684 657
           <folding />
685 658
         </state>
686 659
       </provider>
687 660
     </entry>
688
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words2.txt">
661
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/detail.py">
689 662
       <provider selected="true" editor-type-id="text-editor">
690
-        <state relative-caret-position="422">
691
-          <caret line="0" column="18384" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="437599" />
692
-          <folding />
663
+        <state relative-caret-position="1244">
664
+          <caret line="68" column="71" lean-forward="true" selection-start-line="68" selection-start-column="71" selection-end-line="68" selection-end-column="71" />
665
+          <folding>
666
+            <element signature="e#130#143#0" expanded="false" />
667
+          </folding>
693 668
         </state>
694 669
       </provider>
695 670
     </entry>
696
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/words3.txt">
671
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/pipelines.py">
697 672
       <provider selected="true" editor-type-id="text-editor">
698
-        <state relative-caret-position="456">
699
-          <caret line="0" column="3246" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="356716" />
673
+        <state relative-caret-position="406">
674
+          <caret line="40" column="56" lean-forward="true" selection-start-line="40" selection-start-column="19" selection-end-line="40" selection-end-column="56" />
700 675
           <folding />
701 676
         </state>
702 677
       </provider>
703 678
     </entry>
704
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/dict.txt">
679
+    <entry file="file://$PROJECT_DIR$/README.md">
705 680
       <provider selected="true" editor-type-id="text-editor">
706
-        <state relative-caret-position="-244">
707
-          <caret line="1203" column="9" lean-forward="true" selection-start-line="1203" selection-start-column="9" selection-end-line="1203" selection-end-column="9" />
681
+        <state relative-caret-position="192">
682
+          <caret line="8" column="0" lean-forward="false" selection-start-line="8" selection-start-column="0" selection-end-line="8" selection-end-column="0" />
708 683
           <folding />
709 684
         </state>
710 685
       </provider>
711 686
     </entry>
712 687
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/tool.py">
713 688
       <provider selected="true" editor-type-id="text-editor">
714
-        <state relative-caret-position="384">
715
-          <caret line="16" column="0" lean-forward="false" selection-start-line="16" selection-start-column="0" selection-end-line="16" selection-end-column="0" />
689
+        <state relative-caret-position="200">
690
+          <caret line="9" column="2" lean-forward="false" selection-start-line="9" selection-start-column="2" selection-end-line="9" selection-end-column="69" />
716 691
           <folding />
717 692
         </state>
718 693
       </provider>
719 694
     </entry>
720
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/test/jieba_test.py">
695
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py">
721 696
       <provider selected="true" editor-type-id="text-editor">
722
-        <state relative-caret-position="-2428">
723
-          <caret line="8" column="0" lean-forward="false" selection-start-line="8" selection-start-column="0" selection-end-line="8" selection-end-column="11" />
697
+        <state relative-caret-position="543">
698
+          <caret line="35" column="24" lean-forward="false" selection-start-line="35" selection-start-column="24" selection-end-line="35" selection-end-column="24" />
699
+          <folding>
700
+            <element signature="e#106#131#0" expanded="true" />
701
+          </folding>
702
+        </state>
703
+      </provider>
704
+    </entry>
705
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_keywords.py">
706
+      <provider selected="true" editor-type-id="text-editor">
707
+        <state relative-caret-position="480">
708
+          <caret line="26" column="31" lean-forward="true" selection-start-line="26" selection-start-column="31" selection-end-line="26" selection-end-column="31" />
724 709
           <folding />
725 710
         </state>
726 711
       </provider>
727 712
     </entry>
728
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/word_cloud2.py">
713
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tag.py">
729 714
       <provider selected="true" editor-type-id="text-editor">
730
-        <state relative-caret-position="339">
731
-          <caret line="85" column="50" lean-forward="false" selection-start-line="85" selection-start-column="50" selection-end-line="85" selection-end-column="50" />
715
+        <state relative-caret-position="295">
716
+          <caret line="20" column="27" lean-forward="true" selection-start-line="18" selection-start-column="4" selection-end-line="20" selection-end-column="27" />
717
+          <folding>
718
+            <element signature="e#106#131#0" expanded="true" />
719
+          </folding>
720
+        </state>
721
+      </provider>
722
+    </entry>
723
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_industry.py">
724
+      <provider selected="true" editor-type-id="text-editor">
725
+        <state relative-caret-position="241">
726
+          <caret line="24" column="0" lean-forward="true" selection-start-line="24" selection-start-column="0" selection-end-line="24" selection-end-column="0" />
732 727
           <folding />
733 728
         </state>
734 729
       </provider>
735 730
     </entry>
736 731
     <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/clear_data.py">
737 732
       <provider selected="true" editor-type-id="text-editor">
738
-        <state relative-caret-position="-1300">
739
-          <caret line="105" column="15" lean-forward="false" selection-start-line="105" selection-start-column="15" selection-end-line="105" selection-end-column="15" />
733
+        <state relative-caret-position="758">
734
+          <caret line="115" column="0" lean-forward="false" selection-start-line="115" selection-start-column="0" selection-end-line="125" selection-end-column="15" />
735
+          <folding>
736
+            <element signature="e#132#147#0" expanded="true" />
737
+          </folding>
738
+        </state>
739
+      </provider>
740
+    </entry>
741
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/zhipin_spider.py">
742
+      <provider selected="true" editor-type-id="text-editor">
743
+        <state relative-caret-position="-164">
744
+          <caret line="73" column="50" lean-forward="false" selection-start-line="73" selection-start-column="50" selection-end-line="73" selection-end-column="50" />
740 745
           <folding />
741 746
         </state>
742 747
       </provider>
743 748
     </entry>
744
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py">
749
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/spiders/lagou_list.py">
745 750
       <provider selected="true" editor-type-id="text-editor">
746
-        <state relative-caret-position="620">
747
-          <caret line="78" column="26" lean-forward="true" selection-start-line="78" selection-start-column="26" selection-end-line="78" selection-end-column="26" />
751
+        <state relative-caret-position="-1340">
752
+          <caret line="18" column="31" lean-forward="false" selection-start-line="18" selection-start-column="31" selection-end-line="18" selection-end-column="31" />
748 753
           <folding />
749 754
         </state>
750 755
       </provider>
751 756
     </entry>
752
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_tool.py">
757
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_education.py">
753 758
       <provider selected="true" editor-type-id="text-editor">
754
-        <state relative-caret-position="312">
755
-          <caret line="13" column="0" lean-forward="false" selection-start-line="13" selection-start-column="0" selection-end-line="13" selection-end-column="0" />
759
+        <state relative-caret-position="144">
760
+          <caret line="6" column="25" lean-forward="true" selection-start-line="6" selection-start-column="25" selection-end-line="6" selection-end-column="25" />
756 761
           <folding />
757 762
         </state>
758 763
       </provider>
759 764
     </entry>
760
-    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_workyear.py">
765
+    <entry file="file://$PROJECT_DIR$/www_zhipin_com/analyse/analyse_salary.py">
761 766
       <provider selected="true" editor-type-id="text-editor">
762
-        <state relative-caret-position="288">
763
-          <caret line="12" column="48" lean-forward="false" selection-start-line="12" selection-start-column="48" selection-end-line="12" selection-end-column="48" />
764
-          <folding>
765
-            <element signature="e#106#131#0" expanded="true" />
766
-          </folding>
767
+        <state relative-caret-position="432">
768
+          <caret line="21" column="0" lean-forward="true" selection-start-line="21" selection-start-column="0" selection-end-line="21" selection-end-column="0" />
769
+          <folding />
767 770
         </state>
768 771
       </provider>
769 772
     </entry>

+ 7 - 2
README.md

@@ -1,3 +1,8 @@
1
-# Scrapy_zhipin
1
+## Scrapy_zhipin
2
+
3
+ 爬取并分析 BOSS直聘 和 拉勾网 关于 PHP 程序员的招聘信息,分析 PHP 程序员需要学习和掌握哪些技能
4
+
5
+2017-12-18 更新
6
+
7
+当过去一段时间,目标网站页面可能修改了,所以爬虫可能不能正常工作,这时就需要你自己手动修改代码了
2 8
 
3
- 爬取并分析 BOSS 直聘关于 PHP 程序员的招聘信息,分析 PHP 程序员需要学习和掌握哪些技能

+ 8 - 4
www_zhipin_com/analyse/analyse_education.py

@@ -13,18 +13,22 @@ db = MongoClient('mongodb://127.0.0.1:27017/').iApp
13 13
 
14 14
 
15 15
 def init():
16
-    items = db.jobs_php.find({})
17
-    lables_arr = [item['education'] for item in items if 'education' in item.keys()]
16
+    item_zhipin = db.jobs_zhipin_php.find({})
17
+    item_lagou = db.jobs_lagou_php.find({})
18
+    lables_arr = [item['education'] for item in item_zhipin if 'education' in item.keys() and item['education']]
19
+    for i in item_lagou:
20
+        if 'education' in i.keys() and i['education']:
21
+            lables_arr.append(i['education'])
18 22
     lables_set = set(lables_arr)
19 23
     out = []
20 24
     for lable in lables_set:
21 25
         out.append((lables_arr.count(lable), lable))
22 26
     out = sorted(out, reverse=True)
23 27
     out_arr = [{'value': x, 'name': y} for (x, y) in out[0:30]]
24
-    if len(out_arr)>30:
28
+    if len(out_arr) > 30:
25 29
         out_arr.append({'value': sum([x for (x, y) in out[30:]]), 'name': 'other'})
26 30
     pprint(out_arr)
27
-    print("基于%d条招聘 (%s更新)" % (len(lables_arr), str(datetime.date.today())))
31
+    print("基于%d条招聘信息 (%s更新)" % (len(lables_arr), str(datetime.date.today())))
28 32
     print([item['name'] for item in out_arr])
29 33
 
30 34
 

+ 13 - 5
www_zhipin_com/analyse/analyse_industry.py

@@ -13,18 +13,26 @@ db = MongoClient('mongodb://127.0.0.1:27017/').iApp
13 13
 
14 14
 
15 15
 def init():
16
-    items = db.jobs_php.find({})
16
+    items = db.jobs_zhipin_php.find({})
17 17
     lables_arr = [item['industryField'] for item in items if 'industryField' in item.keys()]
18
+    lables = lables_arr
19
+    # items = db.jobs_lagou_php.find({})  # 拉钩
20
+    # lables = [item['industryField'].split(',') for item in items if
21
+    #           'industryField' in item.keys() and item['industryField']]
22
+    # lables_arr = []
23
+    # for lable in lables:
24
+    #     lables_arr += [i.strip() for i in lable]
25
+
18 26
     lables_set = set(lables_arr)
19 27
     out = []
20 28
     for lable in lables_set:
21 29
         out.append((lables_arr.count(lable), lable))
22 30
     out = sorted(out, reverse=True)
23
-    out_arr = [{'value': x, 'name': y} for (x, y) in out[0:30]]
24
-    if len(out_arr)>30:
25
-        out_arr.append({'value': sum([x for (x, y) in out[30:]]), 'name': 'other'})
31
+    out_arr = [{'value': x, 'name': y} for (x, y) in out if x >= 10]
32
+    other_sum = sum([x for (x, y) in out if x < 10])
33
+    out_arr.append({'value': other_sum, 'name': 'other'})
26 34
     pprint(out_arr)
27
-    print("基于%d条招聘 (%s更新)" % (len(lables_arr), str(datetime.date.today())))
35
+    print("基于%d条招聘信息 (%s更新)" % (len(lables), str(datetime.date.today())))
28 36
     print([item['name'] for item in out_arr])
29 37
 
30 38
 

+ 98 - 0
www_zhipin_com/analyse/analyse_salary.py

@@ -0,0 +1,98 @@
1
+"""
2
+@author: jtahstu
3
+@contact: root@jtahstu.com
4
+@site: http://www.jtahstu.com
5
+@time: 2017/12/15 13:12
6
+"""
7
+from pprint import pprint
8
+import datetime
9
+import time
10
+from pymongo import MongoClient
11
+
12
+db = MongoClient('mongodb://127.0.0.1:27017/').iApp
13
+
14
+def getAllData():
15
+    items_zhipin = db.jobs_zhipin_php.find()
16
+    items_lagou = db.jobs_lagou_php.find()
17
+    items = mergerData(items_zhipin, items_lagou)
18
+    data = getData(items)
19
+    data.update({'city': '全国'})
20
+    return data
21
+
22
+
23
+def getCityData(city):
24
+    items_zhipin = db.jobs_zhipin_php.find({'city': city})
25
+    items_lagou = db.jobs_lagou_php.find({'city': city})
26
+    items = mergerData(items_zhipin, items_lagou)
27
+    data = getData(items)
28
+    data.update({'city': city})
29
+    return data
30
+
31
+
32
+def mergerData(item_zhipin, item_lagou):
33
+    items = [{'level': i['level'], 'salary': i['salary']} for i in item_zhipin if type(i['salary']) == type({'x': 1})]
34
+    for j in item_lagou:
35
+        if type(j['salary']) == type({'x': 1}):
36
+            items.append({'level': j['level'], 'salary': j['salary']})
37
+    return items
38
+
39
+
40
+def getData(items):
41
+    data = {}
42
+    # lables_arr = [item['salary'] for item in items if 'salary' in item.keys()]
43
+    level_salary = [(x['level'], x['salary']) for x in items]
44
+    levels = set([x[0] for x in level_salary])
45
+    data['x'] = [level_to_workYear(x) for x in levels]
46
+    low_avg = []
47
+    avg_avg = []
48
+    high_avg = []
49
+    count_arr = []
50
+    for level in levels:
51
+        salarys = sorted([i[1]['avg'] for i in level_salary if i[0] == level])
52
+        if len(salarys) > 5:
53
+            e = int(len(salarys) * 0.9) - 1
54
+        else:
55
+            e = len(salarys) - 1
56
+        salarys_arr = [item[1] for item in level_salary if item[0] == level and item[1]['avg'] <= salarys[e]]
57
+        count = len(salarys_arr)
58
+        count_arr.append(count)
59
+        avg_sum = low_sum = high_sum = 0
60
+        for salary in salarys_arr:
61
+            avg_sum += salary['avg']
62
+            low_sum += salary['low']
63
+            high_sum += salary['high']
64
+        avg_avg.append(round(avg_sum / count, 2))
65
+        low_avg.append(round(low_sum / count, 2))
66
+        high_avg.append(round(high_sum / count, 2))
67
+        # print("%s 平均薪资:%.2f 平均最低薪资%.2f 平均最高薪资%.2f\n" % (level_to_workYear(level), avg_avg, low_avg, high_avg))
68
+    data['low_avg'] = low_avg
69
+    data['avg_avg'] = avg_avg
70
+    data['high_avg'] = high_avg
71
+    data['count'] = count_arr
72
+    data['subtext'] = "基于%d条招聘信息 (%s更新)" % (sum(count_arr), str(datetime.date.today()))
73
+    data['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
74
+    data['type'] = 'salary'
75
+    data['status'] = 1
76
+    # pprint(data)
77
+    return data
78
+
79
+
80
+def level_to_workYear(level):
81
+    list = {1: '应届生', 2: '1年以内', 3: '1-3年', 4: '3-5年', 5: '5-10年', 6: '10年以上', 10: '经验不限'}
82
+    return list[level]
83
+
84
+
85
+def init():
86
+    db.jobs_php_analyse.update_many({"type": 'salary'}, {"$set": {'status': 0}})
87
+    citys = db.jobs_zhipin_php.distinct('city')
88
+    for city in citys:
89
+        print(db.jobs_php_analyse.insert_one(dict(getCityData(city))))
90
+        # data = getCityData(city)
91
+        # pprint(data)
92
+    # pprint(getAllData())
93
+    pprint(db.jobs_php_analyse.insert_one(dict(getAllData())))
94
+    print('analyse salary ok')
95
+
96
+
97
+if __name__ == "__main__":
98
+    init()

+ 4 - 3
www_zhipin_com/analyse/analyse_tag.py

@@ -13,8 +13,9 @@ db = MongoClient('mongodb://127.0.0.1:27017/').iApp
13 13
 
14 14
 
15 15
 def init():
16
-    items = db.jobs_php.find({})
17
-    lables = [item['positionLables'] for item in items]
16
+    # items = db.jobs_zhipin_php.find({})
17
+    items = db.jobs_lagou_php.find({})
18
+    lables = [item['positionLables'] for item in items if len(item['positionLables']) >= 1]
18 19
     lables_arr = []
19 20
     for lable in lables:
20 21
         lables_arr += lable
@@ -28,7 +29,7 @@ def init():
28 29
     out_arr.append({'value': sum([x for (x, y) in out[30:]]), 'name': 'other'})
29 30
     pprint(out_arr)
30 31
     print(out[30:80])
31
-    print("基于%d条招聘的%d个标签 (排除%d个php标签,%s更新)" % (len(lables), len(lables_arr),tag_php[0], str(datetime.date.today())))
32
+    print("基于%d条招聘的%d个标签 (排除%d个php标签,%s更新)" % (len(lables), len(lables_arr), tag_php[0], str(datetime.date.today())))
32 33
 
33 34
 
34 35
 if __name__ == "__main__":

+ 13 - 0
www_zhipin_com/analyse/analyse_tool.py

@@ -0,0 +1,13 @@
1
+"""
2
+@author: jtahstu
3
+@contact: root@jtahstu.com
4
+@site: http://www.jtahstu.com
5
+@time: 2017/12/16 16:34
6
+"""
7
+from pymongo import MongoClient
8
+
9
+db = MongoClient('mongodb://127.0.0.1:27017/').iApp
10
+
11
+
12
+def insert_php_analyse(data):
13
+    return db.jobs_php_analyse.insert_one(dict(data))

+ 72 - 0
www_zhipin_com/analyse/analyse_workyear.py

@@ -0,0 +1,72 @@
1
+"""
2
+@author: jtahstu
3
+@contact: root@jtahstu.com
4
+@site: http://www.jtahstu.com
5
+@time: 2017/12/15 13:12
6
+"""
7
+from pprint import pprint
8
+import datetime
9
+import time
10
+from pymongo import MongoClient
11
+
12
+db = MongoClient('mongodb://127.0.0.1:27017/').iApp
13
+
14
+
15
+def getAllData():
16
+    items_zhipin = db.jobs_zhipin_php.find()
17
+    items_lagou = db.jobs_lagou_php.find()
18
+    items = mergerData(items_zhipin, items_lagou)
19
+    data = getData(items)
20
+    data.update({'city': '全国'})
21
+    return data
22
+
23
+
24
+def getCityData(city):
25
+    items_zhipin = db.jobs_zhipin_php.find({'city': city})
26
+    items_lagou = db.jobs_lagou_php.find({'city': city})
27
+    items = mergerData(items_zhipin, items_lagou)
28
+    data = getData(items)
29
+    data.update({'city': city})
30
+    return data
31
+
32
+
33
+def mergerData(item_zhipin, item_lagou):
34
+    items = [{'workYear': i['workYear']} for i in item_zhipin]
35
+    for j in item_lagou:
36
+        items.append({'workYear': j['workYear']})
37
+    return items
38
+
39
+
40
+def getData(items):
41
+    data = {}
42
+    lables_arr = [item['workYear'] for item in items if 'workYear' in item.keys()]
43
+    lables_set = set(lables_arr)
44
+    out = []
45
+    for lable in lables_set:
46
+        out.append((lables_arr.count(lable), lable))
47
+    out = sorted(out, reverse=True)
48
+    out_arr = [{'value': x, 'name': y} for (x, y) in out[0:30]]
49
+    if len(out_arr) > 30:
50
+        out_arr.append({'value': sum([x for (x, y) in out[30:]]), 'name': 'other'})
51
+    data['data'] = out_arr
52
+    data['subtext'] = "基于%d条招聘信息 (%s更新)" % (len(lables_arr), str(datetime.date.today()))
53
+    data['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
54
+    data['type'] = 'workyear'
55
+    data['status'] = 1
56
+    # pprint(data)
57
+    return data
58
+
59
+
60
+def init():
61
+    db.jobs_php_analyse.update_many({"type": 'workyear'}, {"$set": {'status': 0}})
62
+    citys = db.jobs_zhipin_php.distinct('city')
63
+    for city in citys:
64
+        data = getCityData(city)
65
+        print(db.jobs_php_analyse.insert_one(dict(data)))
66
+        # pprint(data)
67
+    pprint(db.jobs_php_analyse.insert_one(dict(getAllData())))
68
+    print('analyse workyear ok')
69
+
70
+
71
+if __name__ == "__main__":
72
+    init()

+ 44 - 51
www_zhipin_com/analyse/clear_data.py

@@ -4,13 +4,17 @@
4 4
 @site: http://www.jtahstu.com
5 5
 @time: 2017/12/12 10:54
6 6
 """
7
+
8
+# -*- coding: utf-8 -*-
9
+
7 10
 import datetime
8 11
 import json
9 12
 
10 13
 from pymongo import MongoClient
11 14
 from pprint import pprint
12
-import jieba
13
-import jieba.posseg as psg
15
+
16
+# import jieba
17
+# import jieba.posseg as psg
14 18
 
15 19
 db = MongoClient('127.0.0.1', 27017).iApp
16 20
 
@@ -24,23 +28,11 @@ def find_detail(pid):
24 28
 
25 29
 
26 30
 def update(data):
27
-    return db.jobs_php.update_one({"_id": data['_id']}, {"$set": data})
31
+    return db.jobs_zhipin_php.update_one({"_id": data['_id']}, {"$set": data})
28 32
 
29 33
 
30
-# 把 detail 合并到 jobs_php 中
31
-def merge_detail():
32
-    items = db.jobs_php.find({})
33
-    for item in items:
34
-        if 'detail' in item.keys():
35
-            continue
36
-        detail = find_detail(item['pid'])
37
-        if detail:
38
-            item['detail'] = detail['detail']
39
-            item['location'] = detail['location']
40
-            update(item)
41
-        else:
42
-            print(item['pid'] + " 没有 detail")
43
-    print('ok')
34
def update_lagou(data):
    """Write ``data`` back to the lagou document that has the same ``_id``.

    :param data: document dict; must contain ``_id``.
    :return: the result of ``update_one``.
    """
    selector = {"_id": data['_id']}
    changes = {"$set": data}
    return db.jobs_lagou_php.update_one(selector, changes)
44 36
 
45 37
 
46 38
 # 把时间校正过来
@@ -62,12 +54,19 @@ def clear_time():
62 54
 
63 55
 # 薪水处理成数字
64 56
 def clear_salary():
65
-    items = db.jobs_php.find({})
57
+    items = db.jobs_lagou_php.find({})
66 58
     for item in items:
67 59
         if type(item['salary']) == type({}):
68 60
             continue
69
-        salary_list = item['salary'].replace("K", "000").split("-")
70
-        salary_list = [int(x) for x in salary_list]
61
+        salary_list = item['salary'].lower().replace("k", "000").split("-")
62
+        if len(salary_list) != 2:
63
+            print(salary_list)
64
+            continue
65
+        try:
66
+            salary_list = [int(x) for x in salary_list]
67
+        except:
68
+            print(salary_list)
69
+            continue
71 70
         item['salary'] = {
72 71
             'low': salary_list[0],
73 72
             'high': salary_list[1],
@@ -78,15 +77,23 @@ def clear_salary():
78 77
 
79 78
 
80 79
 def work_year_group():
81
-    items = db.jobs_php.find({}, {"workYear": 1})
82
-    items_value = set([x["workYear"] for x in items])
83
-    print(items_value)
80
+    # items = db.jobs_lagou_php.find({}, {"workYear": 1})
81
+    # items_value = set([x["workYear"] for x in items])
82
+    # print(items_value)
83
+    print(db.jobs_lagou_php.distinct('workYear'))
84 84
     # {'1-3年', '5-10年', '3-5年', '1年以内', '经验不限', '应届生'}
85
+    # "应届毕业生",
86
+    # "1-3年",
87
+    # "3-5年",
88
+    # "5-10年",
89
+    # "不限",
90
+    # "1年以下",
91
+    # "10年以上"
85 92
 
86 93
 
87 94
 # 设置招聘的水平
88 95
 def set_level():
89
-    items = db.jobs_php.find({})
96
+    items = db.jobs_zhipin_php.find({})
90 97
     for item in items:
91 98
         if item['workYear'] == '应届生':
92 99
             item['level'] = 1
@@ -106,28 +113,17 @@ def set_level():
106 113
     print('ok')
107 114
 
108 115
 
109
-def level_avg_salary():
110
-    items = db.jobs_php.find()
111
-    items = [item for item in items]
112
-    level_salary = [(x['level'], x['salary'], x['workYear']) for x in items]
113
-    levels = set([x[0] for x in level_salary])
114
-    # levels = [1]
115
-    for level in levels:
116
-        count = len([x[0] for x in level_salary if x[0] == level])
117
-        print("统计数量 : %d" % count)
118
-        salary_arr = [x[1] for x in level_salary if x[0] == level]
119
-        avg_sum = low_sum = high_sum = 0
120
-        for salary in salary_arr:
121
-            avg_sum += salary['avg']
122
-            low_sum += salary['low']
123
-            high_sum += salary['high']
124
-        avg_avg = avg_sum / count
125
-        low_avg = low_sum / count
126
-        high_avg = high_sum / count
127
-        print("%s 平均薪资:%.2f 平均最低薪资%.2f 平均最高薪资%.2f\n" % (level_to_workYear(level), avg_avg, low_avg, high_avg))
128
-
129
-    print(levels)
130
-    print('level_avg_salary is ok')
116
def update_lagou_workyear():
    """Rename lagou ``workYear`` labels to the spellings ``set_level`` expects.

    Only documents that carry one of the old labels are touched, and only
    their ``workYear`` field is written. Documents without a ``workYear``
    field are left alone instead of raising ``KeyError``.
    """
    # old lagou label -> normalised label; no target is also a source, so
    # the order of the renames does not matter.
    renames = {
        '应届毕业生': '应届生',
        '1年以下': '1年以内',
        '不限': '经验不限',
    }
    for old_label, new_label in renames.items():
        db.jobs_lagou_php.update_many(
            {'workYear': old_label},
            {'$set': {'workYear': new_label}},
        )
    print('ok')
131 127
 
132 128
 
133 129
 def level_to_workYear(level):
@@ -172,15 +168,12 @@ def jieba_cut_test():
172 168
 
173 169
 
174 170
 def init():
175
-    # merge_detail()
176 171
     # clear_time()
177 172
     # clear_salary()
178 173
 
179
-    # work_year_group()
180 174
     # set_level()
181
-
182
-    jieba_cut_test()
183
-    # level_avg_salary()
175
+    update_lagou_workyear()
176
+    # jieba_cut_test()
184 177
 
185 178
 
186 179
 if __name__ == "__main__":

+ 15 - 3
www_zhipin_com/analyse/tool.py

@@ -6,13 +6,25 @@
6 6
 """
7 7
 from pymongo import MongoClient
8 8
 
9
-conn = MongoClient('mongodb://127.0.0.1:27017/')
10
-db = conn.iApp
9
+db = MongoClient('mongodb://127.0.0.1:27017/').iApp
11 10
 
12 11
 
13 12
 def init():
14
-    print(len([item['pid'] for item in db.jobs_php.find().sort('pid') if 'detail' in item.keys()]))
13
+    # find_duplicate()
14
+    # print(len([item['pid'] for item in db.jobs_php.find() if 'detail' in item.keys()]))
15
+    print(db.jobs_lagou_php.find_one({'positionId', 3936381}))
15 16
 
17
def find_duplicate():
    """Report duplicated ``pid`` values in the ``jobs_java`` collection.

    Prints, in this order: the number of documents read, the number of
    distinct ``pid`` values, and the list of ``pid`` values that occur
    more than once.
    """
    from collections import Counter

    # A single counting pass replaces list.count() per distinct pid.
    pid_counts = Counter(item['pid'] for item in db.jobs_java.find().sort('pid'))
    print(sum(pid_counts.values()))
    print(len(pid_counts))
    print([pid for pid, seen in pid_counts.items() if seen > 1])
16 28
 
17 29
 if __name__ == "__main__":
18 30
     init()

File diff suppressed because it is too large
+ 156010 - 0
www_zhipin_com/data/jobs_lagou_php.json


+ 11 - 3
www_zhipin_com/spiders/detail.py

@@ -17,19 +17,25 @@ headers = {
17 17
     'Accept-Encoding': 'gzip, deflate, br',
18 18
     'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
19 19
     'Connection': 'keep-alive',
20
-    'Cookie': '__c=1512975768; __g=-; __jsluid=1f48e9bb513264f042ebc8fd259b3473; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fjob_detail%2F1416202413.html%3Fka%3Dsearch_list_4_blank; lastCity=101020100; JSESSIONID=""; __l=l=%2Fc101020100%2Fh_101020100%2F%3Fquery%3Dphp%26page%3D1%26ka%3Dpage-1&r=; __a=11247336.1512806029.1512806029.1512975768.109.4.39.109; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1512806030,1512975768; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1513303794',
20
+    'Cookie': '__c=1513350136; __l=r=&l=%2F; lastCity=101020100; __jsluid=1e3e2beae73bfca4c89fd38e00d8524a; JSESSIONID=""; __g=-; __a=41702256.1513350136..1513350136.2.1.2.2; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1513350139; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1513350144',
21 21
     'DNT': '1',
22 22
     'Host': 'www.zhipin.com',
23 23
     'Referer': 'https://www.zhipin.com/',
24 24
     'Upgrade-Insecure-Requests': '1',
25
-    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
25
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3202.94 Safari/537.36'
26 26
 }
27
+
28
+# proxies = {
29
+#     'http': 'http://121.232.145.168:9000',
30
+#     'https': 'https://119.122.214.153:9000',
31
+# }
32
+
27 33
 conn = MongoClient('mongodb://127.0.0.1:27017/')
28 34
 db = conn.iApp
29 35
 
30 36
 
31 37
 def init():
32
-    items = db.jobs_php.find().sort('pid')
38
+    items = db.jobs_php.find()
33 39
     for item in items:
34 40
         if 'detail' in item.keys():
35 41
             continue
@@ -53,9 +59,11 @@ def init():
53 59
         print(res)
54 60
         sleep()
55 61
 
62
+
56 63
 def sleep():
57 64
     time.sleep(int(random.uniform(50, 70)))
58 65
 
66
+
59 67
 def save(item):
60 68
     return db.jobs_php.update_one({"_id": item['_id']}, {"$set": item})
61 69
 

File diff suppressed because it is too large
+ 86 - 0
www_zhipin_com/spiders/lagou_list.py