Commit 7746376c302e7ef1da9aba268a67baaa1d2e5175
1 parent
506c39b7
日志统一用英文
Showing
6 changed files
with
112 additions
and
106 deletions
Show diff stats
data/wanbang/amazon_crawler.py
| ... | ... | @@ -85,7 +85,7 @@ class AmazonCrawler: |
| 85 | 85 | params[param] = kwargs[param] |
| 86 | 86 | |
| 87 | 87 | try: |
| 88 | - logger.info(f"正在请求: {query}") | |
| 88 | + logger.info(f"Making request: {query}") | |
| 89 | 89 | self.total_requests += 1 |
| 90 | 90 | |
| 91 | 91 | response = requests.get( |
| ... | ... | @@ -98,20 +98,20 @@ class AmazonCrawler: |
| 98 | 98 | data = response.json() |
| 99 | 99 | |
| 100 | 100 | if data.get('error_code') == '0000': |
| 101 | - logger.info(f"✓ 成功: {query} - 获得 {data.get('items', {}).get('real_total_results', 0)} 个结果") | |
| 101 | + logger.info(f"✓ Success: {query} - Got {data.get('items', {}).get('real_total_results', 0)} results") | |
| 102 | 102 | self.successful_requests += 1 |
| 103 | 103 | return data |
| 104 | 104 | else: |
| 105 | - logger.error(f"✗ API错误: {query} - {data.get('reason', 'Unknown error')}") | |
| 105 | + logger.error(f"✗ API error: {query} - {data.get('reason', 'Unknown error')}") | |
| 106 | 106 | self.failed_requests += 1 |
| 107 | 107 | return data |
| 108 | 108 | |
| 109 | 109 | except requests.exceptions.RequestException as e: |
| 110 | - logger.error(f"✗ 请求失败: {query} - {str(e)}") | |
| 110 | + logger.error(f"✗ Request failed: {query} - {str(e)}") | |
| 111 | 111 | self.failed_requests += 1 |
| 112 | 112 | return None |
| 113 | 113 | except json.JSONDecodeError as e: |
| 114 | - logger.error(f"✗ JSON解析失败: {query} - {str(e)}") | |
| 114 | + logger.error(f"✗ JSON parse failed: {query} - {str(e)}") | |
| 115 | 115 | self.failed_requests += 1 |
| 116 | 116 | return None |
| 117 | 117 | |
| ... | ... | @@ -127,36 +127,36 @@ class AmazonCrawler: |
| 127 | 127 | try: |
| 128 | 128 | with open(filepath, 'w', encoding='utf-8') as f: |
| 129 | 129 | json.dump(data, f, ensure_ascii=False, indent=2) |
| 130 | - logger.debug(f"已保存: {filepath}") | |
| 130 | + logger.debug(f"Saved: {filepath}") | |
| 131 | 131 | except Exception as e: |
| 132 | - logger.error(f"保存失败: {filepath} - {str(e)}") | |
| 132 | + logger.error(f"Save failed: {filepath} - {str(e)}") | |
| 133 | 133 | |
| 134 | 134 | def crawl_from_file(self, queries_file: str, delay: float = 1.0, |
| 135 | 135 | start_index: int = 0, max_queries: Optional[int] = None): |
| 136 | 136 | """从文件读取查询列表并批量爬取""" |
| 137 | 137 | self.start_time = datetime.now() |
| 138 | 138 | logger.info("=" * 70) |
| 139 | - logger.info(f"Amazon爬虫启动 - {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| 139 | + logger.info(f"Amazon crawler started - {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| 140 | 140 | logger.info("=" * 70) |
| 141 | - logger.info(f"查询文件: {queries_file}") | |
| 142 | - logger.info(f"结果目录: {self.results_dir}") | |
| 141 | + logger.info(f"Queries file: {queries_file}") | |
| 142 | + logger.info(f"Results directory: {self.results_dir}") | |
| 143 | 143 | |
| 144 | 144 | try: |
| 145 | 145 | with open(queries_file, 'r', encoding='utf-8') as f: |
| 146 | 146 | queries = [line.strip() for line in f if line.strip()] |
| 147 | 147 | |
| 148 | 148 | total_queries = len(queries) |
| 149 | - logger.info(f"共读取 {total_queries} 个查询") | |
| 149 | + logger.info(f"Total queries read: {total_queries}") | |
| 150 | 150 | |
| 151 | 151 | if start_index > 0: |
| 152 | 152 | queries = queries[start_index:] |
| 153 | - logger.info(f"从索引 {start_index} 开始") | |
| 153 | + logger.info(f"Starting from index {start_index}") | |
| 154 | 154 | |
| 155 | 155 | if max_queries: |
| 156 | 156 | queries = queries[:max_queries] |
| 157 | - logger.info(f"限制爬取数量: {max_queries}") | |
| 157 | + logger.info(f"Limit crawl count to: {max_queries}") | |
| 158 | 158 | |
| 159 | - logger.info(f"请求间隔: {delay} 秒") | |
| 159 | + logger.info(f"Request interval: {delay} seconds") | |
| 160 | 160 | logger.info("=" * 70) |
| 161 | 161 | |
| 162 | 162 | # 逐个爬取 |
| ... | ... | @@ -189,24 +189,24 @@ class AmazonCrawler: |
| 189 | 189 | duration = end_time - self.start_time |
| 190 | 190 | |
| 191 | 191 | logger.info("=" * 70) |
| 192 | - logger.info("爬取完成!") | |
| 192 | + logger.info("Crawling completed!") | |
| 193 | 193 | logger.info("=" * 70) |
| 194 | - logger.info(f"开始时间: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| 195 | - logger.info(f"结束时间: {end_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| 196 | - logger.info(f"总耗时: {duration}") | |
| 197 | - logger.info(f"总请求数: {self.total_requests}") | |
| 198 | - logger.info(f"成功: {self.successful_requests} ({self.successful_requests/self.total_requests*100:.1f}%)") | |
| 199 | - logger.info(f"失败: {self.failed_requests} ({self.failed_requests/self.total_requests*100:.1f}%)") | |
| 200 | - logger.info(f"结果保存在: {self.results_dir.absolute()}") | |
| 194 | + logger.info(f"Start time: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| 195 | + logger.info(f"End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}") | |
| 196 | + logger.info(f"Total duration: {duration}") | |
| 197 | + logger.info(f"Total requests: {self.total_requests}") | |
| 198 | + logger.info(f"Successful: {self.successful_requests} ({self.successful_requests/self.total_requests*100:.1f}%)") | |
| 199 | + logger.info(f"Failed: {self.failed_requests} ({self.failed_requests/self.total_requests*100:.1f}%)") | |
| 200 | + logger.info(f"Results saved to: {self.results_dir.absolute()}") | |
| 201 | 201 | logger.info("=" * 70) |
| 202 | 202 | |
| 203 | 203 | except FileNotFoundError: |
| 204 | - logger.error(f"文件不存在: {queries_file}") | |
| 204 | + logger.error(f"File not found: {queries_file}") | |
| 205 | 205 | except KeyboardInterrupt: |
| 206 | - logger.warning("\n用户中断爬取") | |
| 207 | - logger.info(f"已完成: {self.successful_requests}/{self.total_requests}") | |
| 206 | + logger.warning("\nUser interrupted crawling") | |
| 207 | + logger.info(f"Completed: {self.successful_requests}/{self.total_requests}") | |
| 208 | 208 | except Exception as e: |
| 209 | - logger.error(f"爬取过程出错: {str(e)}", exc_info=True) | |
| 209 | + logger.error(f"Error during crawling: {str(e)}", exc_info=True) | |
| 210 | 210 | |
| 211 | 211 | |
| 212 | 212 | def load_config(): |
| ... | ... | @@ -216,7 +216,7 @@ def load_config(): |
| 216 | 216 | import config |
| 217 | 217 | return config |
| 218 | 218 | except ImportError: |
| 219 | - logger.warning("未找到配置文件 config.py,使用默认配置") | |
| 219 | + logger.warning("Config file config.py not found, using default configuration") | |
| 220 | 220 | return None |
| 221 | 221 | |
| 222 | 222 | |
| ... | ... | @@ -258,12 +258,12 @@ def main(): |
| 258 | 258 | if not api_key or not api_secret or \ |
| 259 | 259 | api_key == "your_api_key_here" or api_secret == "your_api_secret_here": |
| 260 | 260 | logger.error("=" * 70) |
| 261 | - logger.error("错误: 未配置API密钥!") | |
| 261 | + logger.error("Error: API key not configured!") | |
| 262 | 262 | logger.error("") |
| 263 | - logger.error("请使用以下任一方式配置API密钥:") | |
| 264 | - logger.error("1. 命令行参数: --key YOUR_KEY --secret YOUR_SECRET") | |
| 265 | - logger.error("2. 配置文件: 复制 config.example.py 为 config.py 并填入密钥") | |
| 266 | - logger.error("3. 环境变量: ONEBOUND_API_KEY 和 ONEBOUND_API_SECRET") | |
| 263 | + logger.error("Please configure API key using one of the following methods:") | |
| 264 | + logger.error("1. Command line arguments: --key YOUR_KEY --secret YOUR_SECRET") | |
| 265 | + logger.error("2. Config file: Copy config.example.py to config.py and fill in the keys") | |
| 266 | + logger.error("3. Environment variables: ONEBOUND_API_KEY and ONEBOUND_API_SECRET") | |
| 267 | 267 | logger.error("=" * 70) |
| 268 | 268 | return |
| 269 | 269 | ... | ... |
data/wanbang/analyze_results.py
| ... | ... | @@ -30,22 +30,22 @@ class ResultAnalyzer: |
| 30 | 30 | """ |
| 31 | 31 | self.results_dir = Path(results_dir) |
| 32 | 32 | if not self.results_dir.exists(): |
| 33 | - logger.error(f"结果目录不存在: {self.results_dir}") | |
| 33 | + logger.error(f"Results directory does not exist: {self.results_dir}") | |
| 34 | 34 | raise FileNotFoundError(f"Directory not found: {self.results_dir}") |
| 35 | 35 | |
| 36 | 36 | def analyze(self): |
| 37 | 37 | """执行完整分析""" |
| 38 | 38 | logger.info("=" * 70) |
| 39 | - logger.info("Amazon爬取结果分析") | |
| 39 | + logger.info("Amazon crawling result analysis") | |
| 40 | 40 | logger.info("=" * 70) |
| 41 | - logger.info(f"结果目录: {self.results_dir.absolute()}") | |
| 41 | + logger.info(f"Results directory: {self.results_dir.absolute()}") | |
| 42 | 42 | |
| 43 | 43 | # 获取所有JSON文件 |
| 44 | 44 | json_files = list(self.results_dir.glob("*.json")) |
| 45 | - logger.info(f"JSON文件数量: {len(json_files)}") | |
| 45 | + logger.info(f"Number of JSON files: {len(json_files)}") | |
| 46 | 46 | |
| 47 | 47 | if not json_files: |
| 48 | - logger.warning("未找到任何JSON文件") | |
| 48 | + logger.warning("No JSON files found") | |
| 49 | 49 | return |
| 50 | 50 | |
| 51 | 51 | # 统计数据 |
| ... | ... | @@ -62,7 +62,7 @@ class ResultAnalyzer: |
| 62 | 62 | } |
| 63 | 63 | |
| 64 | 64 | # 分析每个文件 |
| 65 | - logger.info("\n正在分析文件...") | |
| 65 | + logger.info("\nAnalyzing files...") | |
| 66 | 66 | for json_file in json_files: |
| 67 | 67 | try: |
| 68 | 68 | with open(json_file, 'r', encoding='utf-8') as f: |
| ... | ... | @@ -118,7 +118,7 @@ class ResultAnalyzer: |
| 118 | 118 | stats['failed'] += 1 |
| 119 | 119 | |
| 120 | 120 | except Exception as e: |
| 121 | - logger.error(f"分析文件失败 {json_file.name}: {str(e)}") | |
| 121 | + logger.error(f"Failed to analyze file {json_file.name}: {str(e)}") | |
| 122 | 122 | stats['failed'] += 1 |
| 123 | 123 | |
| 124 | 124 | # 输出统计结果 |
| ... | ... | @@ -130,29 +130,29 @@ class ResultAnalyzer: |
| 130 | 130 | def print_stats(self, stats: Dict): |
| 131 | 131 | """打印统计信息""" |
| 132 | 132 | logger.info("\n" + "=" * 70) |
| 133 | - logger.info("统计结果") | |
| 133 | + logger.info("Statistics results") | |
| 134 | 134 | logger.info("=" * 70) |
| 135 | 135 | |
| 136 | 136 | # 基本统计 |
| 137 | - logger.info(f"\n【文件统计】") | |
| 138 | - logger.info(f"总文件数: {stats['total_files']}") | |
| 139 | - logger.info(f"成功: {stats['successful']} ({stats['successful']/stats['total_files']*100:.1f}%)") | |
| 140 | - logger.info(f"失败: {stats['failed']} ({stats['failed']/stats['total_files']*100:.1f}%)") | |
| 141 | - | |
| 137 | + logger.info(f"\n[File Statistics]") | |
| 138 | + logger.info(f"Total files: {stats['total_files']}") | |
| 139 | + logger.info(f"Successful: {stats['successful']} ({stats['successful']/stats['total_files']*100:.1f}%)") | |
| 140 | + logger.info(f"Failed: {stats['failed']} ({stats['failed']/stats['total_files']*100:.1f}%)") | |
| 141 | + | |
| 142 | 142 | # 商品统计 |
| 143 | - logger.info(f"\n【商品统计】") | |
| 144 | - logger.info(f"总商品数: {stats['total_items']}") | |
| 143 | + logger.info(f"\n[Product Statistics]") | |
| 144 | + logger.info(f"Total products: {stats['total_items']}") | |
| 145 | 145 | if stats['items_per_query']: |
| 146 | 146 | avg_items = sum(stats['items_per_query']) / len(stats['items_per_query']) |
| 147 | 147 | max_items = max(stats['items_per_query']) |
| 148 | 148 | min_items = min(stats['items_per_query']) |
| 149 | - logger.info(f"平均每个查询: {avg_items:.1f} 个商品") | |
| 150 | - logger.info(f"最多: {max_items} 个") | |
| 151 | - logger.info(f"最少: {min_items} 个") | |
| 149 | + logger.info(f"Average per query: {avg_items:.1f} products") | |
| 150 | + logger.info(f"Maximum: {max_items} products") | |
| 151 | + logger.info(f"Minimum: {min_items} products") | |
| 152 | 152 | |
| 153 | 153 | # 价格分布 |
| 154 | 154 | if stats['price_ranges']: |
| 155 | - logger.info(f"\n【价格分布】") | |
| 155 | + logger.info(f"\n[Price Distribution]") | |
| 156 | 156 | total_priced = sum(stats['price_ranges'].values()) |
| 157 | 157 | for price_range, count in sorted(stats['price_ranges'].items()): |
| 158 | 158 | percentage = count / total_priced * 100 |
| ... | ... | @@ -162,15 +162,15 @@ class ResultAnalyzer: |
| 162 | 162 | if stats['avg_reviews']: |
| 163 | 163 | avg_reviews = sum(stats['avg_reviews']) / len(stats['avg_reviews']) |
| 164 | 164 | max_reviews = max(stats['avg_reviews']) |
| 165 | - logger.info(f"\n【评论统计】") | |
| 166 | - logger.info(f"平均评论数: {avg_reviews:.0f}") | |
| 167 | - logger.info(f"最高评论数: {max_reviews}") | |
| 165 | + logger.info(f"\n[Review Statistics]") | |
| 166 | + logger.info(f"Average reviews: {avg_reviews:.0f}") | |
| 167 | + logger.info(f"Highest reviews: {max_reviews}") | |
| 168 | 168 | |
| 169 | 169 | # 评分统计 |
| 170 | 170 | if stats['avg_stars']: |
| 171 | 171 | avg_stars = sum(stats['avg_stars']) / len(stats['avg_stars']) |
| 172 | - logger.info(f"\n【评分统计】") | |
| 173 | - logger.info(f"平均评分: {avg_stars:.2f}") | |
| 172 | + logger.info(f"\n[Rating Statistics]") | |
| 173 | + logger.info(f"Average rating: {avg_stars:.2f}") | |
| 174 | 174 | |
| 175 | 175 | logger.info("\n" + "=" * 70) |
| 176 | 176 | |
| ... | ... | @@ -204,9 +204,9 @@ class ResultAnalyzer: |
| 204 | 204 | try: |
| 205 | 205 | with open(report_file, 'w', encoding='utf-8') as f: |
| 206 | 206 | json.dump(report, f, ensure_ascii=False, indent=2) |
| 207 | - logger.info(f"分析报告已保存: {report_file}") | |
| 207 | + logger.info(f"Analysis report saved: {report_file}") | |
| 208 | 208 | except Exception as e: |
| 209 | - logger.error(f"保存报告失败: {str(e)}") | |
| 209 | + logger.error(f"Failed to save report: {str(e)}") | |
| 210 | 210 | |
| 211 | 211 | def export_csv(self, output_file: str = None): |
| 212 | 212 | """导出为CSV格式""" |
| ... | ... | @@ -215,7 +215,7 @@ class ResultAnalyzer: |
| 215 | 215 | if output_file is None: |
| 216 | 216 | output_file = self.results_dir / "items_export.csv" |
| 217 | 217 | |
| 218 | - logger.info(f"\n导出CSV: {output_file}") | |
| 218 | + logger.info(f"\nExporting to CSV: {output_file}") | |
| 219 | 219 | |
| 220 | 220 | json_files = list(self.results_dir.glob("*.json")) |
| 221 | 221 | |
| ... | ... | @@ -243,9 +243,9 @@ class ResultAnalyzer: |
| 243 | 243 | item.get('detail_url', '') |
| 244 | 244 | ]) |
| 245 | 245 | except Exception as e: |
| 246 | - logger.error(f"导出失败 {json_file.name}: {str(e)}") | |
| 246 | + logger.error(f"Export failed for {json_file.name}: {str(e)}") | |
| 247 | 247 | |
| 248 | - logger.info(f"CSV导出完成: {output_file}") | |
| 248 | + logger.info(f"CSV export completed: {output_file}") | |
| 249 | 249 | |
| 250 | 250 | |
| 251 | 251 | def main(): |
| ... | ... | @@ -270,7 +270,7 @@ def main(): |
| 270 | 270 | analyzer.export_csv(args.output) |
| 271 | 271 | |
| 272 | 272 | except Exception as e: |
| 273 | - logger.error(f"分析失败: {str(e)}") | |
| 273 | + logger.error(f"Analysis failed: {str(e)}") | |
| 274 | 274 | |
| 275 | 275 | |
| 276 | 276 | if __name__ == "__main__": | ... | ... |
docs/常用查询 - sql.sql
scripts/check_es_data.py
| 1 | 1 | #!/usr/bin/env python3 |
| 2 | 2 | """ |
| 3 | -检查ES索引中的实际数据,看分面字段是否有值 | |
| 3 | +Check actual data in ES index to see if facet fields have values | |
| 4 | 4 | """ |
| 5 | 5 | |
| 6 | 6 | import sys |
| ... | ... | @@ -14,9 +14,9 @@ from utils.es_client import ESClient |
| 14 | 14 | |
| 15 | 15 | |
| 16 | 16 | def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): |
| 17 | - """检查ES中的分面相关字段""" | |
| 17 | + """Check facet-related fields in ES""" | |
| 18 | 18 | print("\n" + "="*60) |
| 19 | - print("检查ES索引中的分面字段数据") | |
| 19 | + print("Checking facet field data in ES index") | |
| 20 | 20 | print("="*60) |
| 21 | 21 | |
| 22 | 22 | query = { |
| ... | ... | @@ -46,14 +46,14 @@ def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): |
| 46 | 46 | hits = response.get('hits', {}).get('hits', []) |
| 47 | 47 | total = response.get('hits', {}).get('total', {}).get('value', 0) |
| 48 | 48 | |
| 49 | - print(f"\n总文档数: {total}") | |
| 50 | - print(f"检查前 {len(hits)} 个文档:\n") | |
| 49 | + print(f"\nTotal documents: {total}") | |
| 50 | + print(f"Checking first {len(hits)} documents:\n") | |
| 51 | 51 | |
| 52 | 52 | for i, hit in enumerate(hits, 1): |
| 53 | 53 | source = hit.get('_source', {}) |
| 54 | 54 | title_obj = source.get("title") or {} |
| 55 | 55 | category_path_obj = source.get("category_path") or {} |
| 56 | - print(f"文档 {i}:") | |
| 56 | + print(f"Document {i}:") | |
| 57 | 57 | print(f" spu_id: {source.get('spu_id')}") |
| 58 | 58 | print(f" title.zh: {str(title_obj.get('zh', ''))[:50] if isinstance(title_obj, dict) else ''}") |
| 59 | 59 | print(f" category1_name: {source.get('category1_name')}") |
| ... | ... | @@ -67,24 +67,24 @@ def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): |
| 67 | 67 | |
| 68 | 68 | specs = source.get('specifications', []) |
| 69 | 69 | if specs: |
| 70 | - print(f" specifications 数量: {len(specs)}") | |
| 70 | + print(f" specifications count: {len(specs)}") | |
| 71 | 71 | # 显示前3个specifications |
| 72 | 72 | for spec in specs[:3]: |
| 73 | 73 | print(f" - name: {spec.get('name')}, value: {spec.get('value')}") |
| 74 | 74 | else: |
| 75 | - print(f" specifications: 空") | |
| 75 | + print(f" specifications: empty") | |
| 76 | 76 | print() |
| 77 | 77 | |
| 78 | 78 | except Exception as e: |
| 79 | - print(f"错误: {e}") | |
| 79 | + print(f"Error: {e}") | |
| 80 | 80 | import traceback |
| 81 | 81 | traceback.print_exc() |
| 82 | 82 | |
| 83 | 83 | |
| 84 | 84 | def check_facet_aggregations(es_client, tenant_id: str): |
| 85 | - """检查分面聚合查询""" | |
| 85 | + """Check facet aggregation queries""" | |
| 86 | 86 | print("\n" + "="*60) |
| 87 | - print("检查分面聚合查询结果") | |
| 87 | + print("Checking facet aggregation query results") | |
| 88 | 88 | print("="*60) |
| 89 | 89 | |
| 90 | 90 | query = { |
| ... | ... | @@ -174,16 +174,16 @@ def check_facet_aggregations(es_client, tenant_id: str): |
| 174 | 174 | response = es_client.client.search(index="search_products", body=query) |
| 175 | 175 | aggs = response.get('aggregations', {}) |
| 176 | 176 | |
| 177 | - print("\n1. category1_name 分面:") | |
| 177 | + print("\n1. category1_name facet:") | |
| 178 | 178 | category1 = aggs.get('category1_facet', {}) |
| 179 | 179 | buckets = category1.get('buckets', []) |
| 180 | 180 | if buckets: |
| 181 | 181 | for bucket in buckets: |
| 182 | 182 | print(f" {bucket['key']}: {bucket['doc_count']}") |
| 183 | 183 | else: |
| 184 | - print(" 空(没有数据)") | |
| 184 | + print(" empty (no data)") | |
| 185 | 185 | |
| 186 | - print("\n2. specifications.color 分面:") | |
| 186 | + print("\n2. specifications.color facet:") | |
| 187 | 187 | color_agg = aggs.get('color_facet', {}) |
| 188 | 188 | color_filter = color_agg.get('filter_by_name', {}) |
| 189 | 189 | color_values = color_filter.get('value_counts', {}) |
| ... | ... | @@ -192,9 +192,9 @@ def check_facet_aggregations(es_client, tenant_id: str): |
| 192 | 192 | for bucket in color_buckets: |
| 193 | 193 | print(f" {bucket['key']}: {bucket['doc_count']}") |
| 194 | 194 | else: |
| 195 | - print(" 空(没有数据)") | |
| 195 | + print(" empty (no data)") | |
| 196 | 196 | |
| 197 | - print("\n3. specifications.size 分面:") | |
| 197 | + print("\n3. specifications.size facet:") | |
| 198 | 198 | size_agg = aggs.get('size_facet', {}) |
| 199 | 199 | size_filter = size_agg.get('filter_by_name', {}) |
| 200 | 200 | size_values = size_filter.get('value_counts', {}) |
| ... | ... | @@ -203,9 +203,9 @@ def check_facet_aggregations(es_client, tenant_id: str): |
| 203 | 203 | for bucket in size_buckets: |
| 204 | 204 | print(f" {bucket['key']}: {bucket['doc_count']}") |
| 205 | 205 | else: |
| 206 | - print(" 空(没有数据)") | |
| 206 | + print(" empty (no data)") | |
| 207 | 207 | |
| 208 | - print("\n4. specifications.material 分面:") | |
| 208 | + print("\n4. specifications.material facet:") | |
| 209 | 209 | material_agg = aggs.get('material_facet', {}) |
| 210 | 210 | material_filter = material_agg.get('filter_by_name', {}) |
| 211 | 211 | material_values = material_filter.get('value_counts', {}) |
| ... | ... | @@ -214,19 +214,19 @@ def check_facet_aggregations(es_client, tenant_id: str): |
| 214 | 214 | for bucket in material_buckets: |
| 215 | 215 | print(f" {bucket['key']}: {bucket['doc_count']}") |
| 216 | 216 | else: |
| 217 | - print(" 空(没有数据)") | |
| 217 | + print(" empty (no data)") | |
| 218 | 218 | |
| 219 | 219 | except Exception as e: |
| 220 | - print(f"错误: {e}") | |
| 220 | + print(f"Error: {e}") | |
| 221 | 221 | import traceback |
| 222 | 222 | traceback.print_exc() |
| 223 | 223 | |
| 224 | 224 | |
| 225 | 225 | def main(): |
| 226 | - parser = argparse.ArgumentParser(description='检查ES索引中的分面字段数据') | |
| 226 | + parser = argparse.ArgumentParser(description='Check facet field data in ES index') | |
| 227 | 227 | parser.add_argument('--tenant-id', required=True, help='Tenant ID') |
| 228 | - parser.add_argument('--es-host', help='Elasticsearch host (或使用环境变量 ES_HOST, 默认: http://localhost:9200)') | |
| 229 | - parser.add_argument('--size', type=int, default=5, help='检查的文档数量 (默认: 5)') | |
| 228 | + parser.add_argument('--es-host', help='Elasticsearch host (or use env var ES_HOST, default: http://localhost:9200)') | |
| 229 | + parser.add_argument('--size', type=int, default=5, help='Number of documents to check (default: 5)') | |
| 230 | 230 | |
| 231 | 231 | args = parser.parse_args() |
| 232 | 232 | |
| ... | ... | @@ -235,7 +235,7 @@ def main(): |
| 235 | 235 | es_username = os.environ.get('ES_USERNAME') |
| 236 | 236 | es_password = os.environ.get('ES_PASSWORD') |
| 237 | 237 | |
| 238 | - print(f"连接Elasticsearch: {es_host}") | |
| 238 | + print(f"Connecting to Elasticsearch: {es_host}") | |
| 239 | 239 | print(f"Tenant ID: {args.tenant_id}\n") |
| 240 | 240 | |
| 241 | 241 | try: |
| ... | ... | @@ -245,11 +245,11 @@ def main(): |
| 245 | 245 | es_client = ESClient(hosts=[es_host]) |
| 246 | 246 | |
| 247 | 247 | if not es_client.ping(): |
| 248 | - print(f"✗ 无法连接到Elasticsearch: {es_host}") | |
| 248 | + print(f"✗ Cannot connect to Elasticsearch: {es_host}") | |
| 249 | 249 | return 1 |
| 250 | - print("✓ Elasticsearch连接成功\n") | |
| 250 | + print("✓ Elasticsearch connected successfully\n") | |
| 251 | 251 | except Exception as e: |
| 252 | - print(f"✗ 连接Elasticsearch失败: {e}") | |
| 252 | + print(f"✗ Failed to connect to Elasticsearch: {e}") | |
| 253 | 253 | return 1 |
| 254 | 254 | |
| 255 | 255 | # 检查ES数据 |
| ... | ... | @@ -257,7 +257,7 @@ def main(): |
| 257 | 257 | check_facet_aggregations(es_client, args.tenant_id) |
| 258 | 258 | |
| 259 | 259 | print("\n" + "="*60) |
| 260 | - print("检查完成") | |
| 260 | + print("Check completed") | |
| 261 | 261 | print("="*60) |
| 262 | 262 | |
| 263 | 263 | return 0 | ... | ... |
scripts/monitor_eviction.py
| 1 | 1 | #!/usr/bin/env python3 |
| 2 | 2 | """ |
| 3 | -实时监控 Redis 缓存淘汰事件 | |
| 3 | +Real-time monitoring of Redis cache eviction events | |
| 4 | 4 | |
| 5 | -持续监控 evicted_keys 统计,当有新的淘汰发生时发出警告 | |
| 5 | +Continuously monitor evicted_keys statistics and warn when new evictions occur | |
| 6 | 6 | """ |
| 7 | 7 | |
| 8 | 8 | import redis |
| ... | ... | @@ -18,7 +18,7 @@ sys.path.insert(0, str(project_root)) |
| 18 | 18 | from config.env_config import REDIS_CONFIG |
| 19 | 19 | |
| 20 | 20 | def get_redis_client(): |
| 21 | - """获取 Redis 客户端""" | |
| 21 | + """Get Redis client""" | |
| 22 | 22 | return redis.Redis( |
| 23 | 23 | host=REDIS_CONFIG.get('host', 'localhost'), |
| 24 | 24 | port=REDIS_CONFIG.get('port', 6479), |
| ... | ... | @@ -29,12 +29,12 @@ def get_redis_client(): |
| 29 | 29 | ) |
| 30 | 30 | |
| 31 | 31 | def monitor_eviction(interval=5): |
| 32 | - """持续监控淘汰事件""" | |
| 32 | + """Continuously monitor eviction events""" | |
| 33 | 33 | print("=" * 60) |
| 34 | - print("Redis 缓存淘汰实时监控") | |
| 34 | + print("Redis Cache Eviction Real-time Monitoring") | |
| 35 | 35 | print("=" * 60) |
| 36 | - print(f"监控间隔: {interval} 秒") | |
| 37 | - print("按 Ctrl+C 停止监控") | |
| 36 | + print(f"Monitoring interval: {interval} seconds") | |
| 37 | + print("Press Ctrl+C to stop monitoring") | |
| 38 | 38 | print("=" * 60) |
| 39 | 39 | print() |
| 40 | 40 | |
| ... | ... | @@ -42,7 +42,7 @@ def monitor_eviction(interval=5): |
| 42 | 42 | client = get_redis_client() |
| 43 | 43 | client.ping() |
| 44 | 44 | except Exception as e: |
| 45 | - print(f"❌ Redis 连接失败: {e}") | |
| 45 | + print(f"❌ Redis connection failed: {e}") | |
| 46 | 46 | return |
| 47 | 47 | |
| 48 | 48 | last_evicted = 0 |
| ... | ... | @@ -55,8 +55,8 @@ def monitor_eviction(interval=5): |
| 55 | 55 | if current_evicted > last_evicted: |
| 56 | 56 | new_evictions = current_evicted - last_evicted |
| 57 | 57 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
| 58 | - print(f"[{timestamp}] ⚠️ 检测到 {new_evictions} 个新的淘汰事件!") | |
| 59 | - print(f" 累计淘汰总数: {current_evicted:,}") | |
| 58 | + print(f"[{timestamp}] ⚠️ Detected {new_evictions} new eviction events!") | |
| 59 | + print(f" Total evictions: {current_evicted:,}") | |
| 60 | 60 | |
| 61 | 61 | # 检查内存使用情况 |
| 62 | 62 | mem_info = client.info('memory') |
| ... | ... | @@ -64,26 +64,26 @@ def monitor_eviction(interval=5): |
| 64 | 64 | used_memory = mem_info.get('used_memory', 0) |
| 65 | 65 | if maxmemory > 0: |
| 66 | 66 | usage_percent = (used_memory / maxmemory) * 100 |
| 67 | - print(f" 当前内存使用率: {usage_percent:.2f}%") | |
| 67 | + print(f" Current memory usage: {usage_percent:.2f}%") | |
| 68 | 68 | |
| 69 | 69 | last_evicted = current_evicted |
| 70 | 70 | else: |
| 71 | 71 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
| 72 | - print(f"[{timestamp}] ✅ 无新淘汰事件 (累计: {current_evicted:,})") | |
| 72 | + print(f"[{timestamp}] ✅ No new eviction events (Total: {current_evicted:,})") | |
| 73 | 73 | |
| 74 | 74 | time.sleep(interval) |
| 75 | 75 | |
| 76 | 76 | except KeyboardInterrupt: |
| 77 | - print("\n\n监控已停止") | |
| 77 | + print("\n\nMonitoring stopped") | |
| 78 | 78 | except Exception as e: |
| 79 | - print(f"\n❌ 监控出错: {e}") | |
| 79 | + print(f"\n❌ Monitoring error: {e}") | |
| 80 | 80 | import traceback |
| 81 | 81 | traceback.print_exc() |
| 82 | 82 | |
| 83 | 83 | if __name__ == "__main__": |
| 84 | 84 | import argparse |
| 85 | - parser = argparse.ArgumentParser(description='实时监控 Redis 缓存淘汰事件') | |
| 86 | - parser.add_argument('--interval', type=int, default=5, help='监控间隔(秒),默认 5 秒') | |
| 85 | + parser = argparse.ArgumentParser(description='Real-time monitoring of Redis cache eviction events') | |
| 86 | + parser.add_argument('--interval', type=int, default=5, help='Monitoring interval in seconds (default: 5)') | |
| 87 | 87 | args = parser.parse_args() |
| 88 | 88 | |
| 89 | 89 | monitor_eviction(interval=args.interval) | ... | ... |